
데이터엔지니어링_Ver6

포비용 2024. 6. 2. 16:12
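The script below pages through the Seoul Media Hub "hometown news" list with Selenium to collect article IDs, then revisits each article to save its cleaned HTML and download its images.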
import time

from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

main_url = 'https://mediahub.seoul.go.kr/news/hometown/hometownNewsList.do'
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
driver.get(main_url)


cnt = 0
j = 2
ID_list = []

# Crawl post IDs while paging through the list
while True:
    try:
        time.sleep(2)
        for i in range(1, 9):
            html = driver.page_source
            soup = bs(html, 'html.parser')
            IDs = soup.select(f'#news_List > ul > li:nth-child({i}) > div.thum > a')
            # The article ID is the first single-quoted argument of the onclick handler
            ID = IDs[0].get('onclick')
            ID = ID.split('\'')[1]
            ID_list.append(ID)
            cnt += 1
            print(cnt)
            if cnt == 20:
                break
        if cnt == 20:
            break
            

    # soup.select() returns an empty list past the last page, so IndexError
    # (not NoSuchElementException) is the practical end-of-list signal here
    except (NoSuchElementException, IndexError):
        print('Reached the last page.')
        break

    try:
        time.sleep(1)
        # Reset the pagination index after the last button in the pager
        if j == 8:
            j = 1
        # Click the j-th pagination button to load the next page of results
        page = driver.find_element(By.XPATH, f'//*[@id="news_List"]/div/ul/li[{j}]')
        try:
            time.sleep(1)
            page.click()
            j += 1
        except Exception:
            pass

    except NoSuchElementException:
        print('Reached the last page.')
        pass
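
# Side note: ID.split('\'')[1] above just takes the first single-quoted token
# out of the card's onclick attribute. The small helper below makes that
# assumption explicit -- a minimal sketch, assuming the onclick value carries
# the article ID as its first quoted argument (the surrounding handler name
# is not important and is not known here).
import re

def extract_article_id(onclick: str):
    # e.g. "someHandler('2009...')" -> "2009..."  (handler name is hypothetical)
    match = re.search(r"'([^']+)'", onclick)
    return match.group(1) if match else None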
        
### --------------------------------------

import os
import pprint

import requests
from bs4 import BeautifulSoup

# Make sure the output directories exist before writing into them
os.makedirs("./save_html", exist_ok=True)
os.makedirs("./seoul_images", exist_ok=True)

# Deduplicate the collected article IDs
set_list = set(ID_list)
driver = webdriver.Chrome()
processed_html = []
for idx, site_id in enumerate(set_list):
    url = "https://mediahub.seoul.go.kr/archives/" + str(site_id)
    print(url)
    driver.get(url) 
    
    # Remove page chrome (utility/share widgets, sidebar, article footer,
    # slider navigation) so that only the article body remains in the saved HTML
    driver.execute_script("""
        var selectors = [
            'div.utilItem', 'div.asideInner', 'div.news_detail_foot',
            'div.detailUtil', 'ul.flex-direction-nav', 'ul.slides', 'div.current'
        ];
        selectors.forEach(function(selector) {
            document.querySelectorAll(selector).forEach(function(element) {
                element.remove();
            });
        });
    """)
    
    elem = driver.find_element(By.CLASS_NAME, "newsDetail").get_attribute("outerHTML")
    # URL preprocessing: rewrite relative src paths as absolute URLs
    elem = elem.replace('src="/', 'src="https://mediahub.seoul.go.kr/')
    time.sleep(1)
    # Save the cleaned article HTML to a file
    with open(f"./save_html/html_save_{site_id}_{idx}.html", "wb") as file:
        file.write(elem.encode('utf-8'))
        
    time.sleep(1)
    soup = BeautifulSoup(elem, "html.parser")
    formatted_data = pprint.pformat(soup, width=40, indent=4)
    processed_html.append(formatted_data)
    images = soup.select("img")
    
    downloaded_images = set()
    
    for img_num, image in enumerate(images):
        src = image["src"]
        if src in downloaded_images:
            continue
            
        downloaded_images.add(src)
        # Download each image only once and write it straight to disk
        image_content = requests.get(src).content
        file_path = f"./seoul_images/{site_id}_{img_num}.png"
        with open(file_path, "wb") as image_data:
            image_data.write(image_content)
            
    time.sleep(1)
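
# The image loop above always writes .png and assumes every request succeeds.
# A slightly more defensive variant (a sketch, not part of the original run)
# could check the HTTP status and pick the extension from the Content-Type
# header instead:
import mimetypes

import requests

def save_image(src, dest_prefix):
    # dest_prefix is e.g. "./seoul_images/{site_id}_{img_num}" (no extension)
    response = requests.get(src, timeout=10)
    if response.status_code != 200:
        return None
    # Fall back to .png when the server does not report a usable content type
    ext = mimetypes.guess_extension(response.headers.get("Content-Type", "")) or ".png"
    path = dest_prefix + ext
    with open(path, "wb") as fh:
        fh.write(response.content)
    return path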