데이터분석_실무/크롤링실무

정책사이트_데이터수집(실무)

포비용 2024. 4. 1. 01:55
import time
from datetime import datetime
from urllib.parse import parse_qs, urlsplit

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

# Scrape the youth.seoul.go.kr support-info listing (pages 1-19) and collect
# one row per policy into ``info_lists``:
#     [sprtInfoId, policy name, link href, *detail fields from "ul.info li"]

# Listing URL; only the page index varies between requests.
_LIST_URL = (
    "https://youth.seoul.go.kr/infoData/sprtInfo/list.do"
    "?sprtInfoId=&key=2309130006&pageIndex={page}"
    "&orderBy=regYmd+desc&recordCountPerPage=8&sc_ctgry=&sw=&viewType=on"
    "&sc_aplyPrdEndYmdUseYn="
    "&cntrLa=37.566761128870546&cntrLo=126.97862963872868"
    "&neLat=37.566761128870546&neLng=126.97862963872868"
    "&swLat=37.566761128870546&swLng=126.97862963872868"
    "&mapLvl=6&sarea=#none"
)

# Seconds to wait for the detail page before requests gives up.
_REQUEST_TIMEOUT = 10

# Tokens that mark a two-token "<label> <value>" item as one worth keeping.
_FIELD_KEYWORDS = ("상시", "대상", "담당기관")

# Labels that still occupy a column (as the string "None") when they have no value.
_LABEL_ONLY = ("대상", "담당기관")


def _sprt_info_id(url):
    """Return the ``sprtInfoId`` query value of *url*, or None if absent/empty."""
    values = parse_qs(urlsplit(url).query).get("sprtInfoId")
    return values[0] if values else None


def _field_value(tokens):
    """Return the single value one info item contributes, or None to skip it.

    *tokens* is the item's text split on whitespace; the first token is
    treated as the label. Each item yields at most one value so that every
    row keeps the same column layout.
    """
    if len(tokens) >= 3:
        # Multi-word value (e.g. a date range): keep everything after the label.
        return " ".join(tokens[1:])
    if len(tokens) == 2 and any(key in tokens for key in _FIELD_KEYWORDS):
        return tokens[1]
    if len(tokens) == 1 and tokens[0] in _LABEL_ONLY:
        # Label present but empty: keep the column with a placeholder.
        return "None"
    return None


# NOTE: the driver is intentionally left open here; code after this section
# may still use it. Call driver.quit() once scraping is finished.
driver = webdriver.Chrome()
info_lists = list()
for i in range(1, 20):
    driver.get(_LIST_URL.format(page=i))
    item_count = len(driver.find_elements(By.CLASS_NAME, "item-overlay"))

    for position in range(item_count):
        # Element references found before a navigation can go stale after
        # driver.back(), so look the items up again on every pass.
        elems = driver.find_elements(By.CLASS_NAME, "item-overlay")
        if position >= len(elems):
            break
        elems[position].click()

        current_url = driver.current_url
        res = requests.get(current_url, timeout=_REQUEST_TIMEOUT)
        soup = BeautifulSoup(res.content, "html.parser")

        # A detail page missing the expected nodes yields None for that field
        # instead of aborting the whole crawl.
        name_node = soup.select_one("div.cont > div.tit > strong")
        link_node = soup.select_one("div.btn-group > a")
        wello_info = [
            _sprt_info_id(current_url),
            name_node.string if name_node is not None else None,
            link_node.get("href") if link_node is not None else None,
        ]

        for text in soup.select("ul.info li"):
            value = _field_value(text.text.split())
            if value is not None:
                wello_info.append(value)

        info_lists.append(wello_info)
        driver.back()

    # Pause between listing pages to avoid hammering the site.
    time.sleep(3)