데이터분석_실무/크롤링실무
데이터엔지니어링_Ver6
포비용
2024. 6. 2. 16:12
# --- Phase 1: page through the Seoul media-hub "hometown news" list and
# --- collect article IDs (stops after 20 IDs or at the last page).
main_url = 'https://mediahub.seoul.go.kr/news/hometown/hometownNewsList.do'
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
driver.get(main_url)

cnt = 0        # number of article IDs collected so far
j = 2          # 1-based index of the NEXT pagination button to click (li[2] == page 2)
ID_list = []

# Scrape up to 8 list items per page, then click the next pager button.
while True:
    try:
        time.sleep(2)  # give the page time to render before parsing
        # Parse the page source once per page (the original re-fetched it
        # on every one of the 8 inner iterations for no benefit).
        html = driver.page_source
        soup = bs(html, 'html.parser')
        for i in range(1, 9):
            IDs = soup.select(f'#news_List > ul > li:nth-child({i}) > div.thum > a')
            if not IDs:
                # Fewer than 8 items on this page (e.g. the last page).
                # BUGFIX: the original did IDs[0] unconditionally, which would
                # raise IndexError — not the NoSuchElementException it catches.
                continue
            onclick = IDs[0].get('onclick')
            # onclick looks like fn('12345', ...) — the ID is the first
            # single-quoted token.
            ID = onclick.split('\'')[1]
            ID_list.append(ID)
            cnt += 1
            print(cnt)
            if cnt == 20:
                break
        if cnt == 20:
            break
    except NoSuchElementException:
        print('마지막 페이지입니다.')
        break
    try:
        time.sleep(1)
        # The pager shows a fixed number of buttons; wrap back to the first slot.
        if j == 8:
            j = 1  # BUGFIX: original wrote `j == 1`, a no-op comparison
        pages = driver.find_element(By.XPATH, "//*[@id='news_List']")
        page = pages.find_element(By.XPATH, f'//*[@id="news_List"]/div/ul/li[{j}]')
        try:
            time.sleep(1)
            page.click()
            j += 1
        except Exception:
            # Best-effort click: the button may be stale or obscured; narrowed
            # from the original bare `except:` which also swallowed
            # KeyboardInterrupt/SystemExit.
            pass
    except NoSuchElementException:
        print('마지막 페이지입니다.')
        pass
### --------------------------------------
# --- Phase 2: visit each collected article, strip navigation/share widgets,
# --- save the cleaned article HTML, and download each embedded image once.
set_list = set(ID_list)  # de-duplicate IDs gathered while paging
driver = webdriver.Chrome()
processed_html = []
for idx, site_id in enumerate(set_list):
    url = "https://mediahub.seoul.go.kr/archives/" + str(site_id)
    print(url)
    driver.get(url)
    # Remove boilerplate (toolbars, sidebars, footers, sliders) in the live DOM
    # so only the article body remains when we grab .newsDetail below.
    driver.execute_script("""
var utilItems = document.querySelectorAll('div.utilItem');
utilItems.forEach(function(element) {
element.remove();
});
var asideInners = document.querySelectorAll('div.asideInner');
asideInners.forEach(function(element) {
element.remove();
});
var newsDetailFoots = document.querySelectorAll('div.news_detail_foot');
newsDetailFoots.forEach(function(element) {
element.remove();
});
var detailUtilItems = document.querySelectorAll('div.detailUtil');
detailUtilItems.forEach(function(element) {
element.remove();
});
var flexDirectionNavs = document.querySelectorAll('ul.flex-direction-nav');
flexDirectionNavs.forEach(function(element) {
element.remove();
});
var slides = document.querySelectorAll('ul.slides');
slides.forEach(function(element) {
element.remove();
});
var currents = document.querySelectorAll('div.current');
currents.forEach(function(element) {
element.remove();
});
""")
    elem = driver.find_element(By.CLASS_NAME, "newsDetail").get_attribute("outerHTML")
    # Make root-relative image URLs absolute so they resolve outside the site.
    elem = elem.replace('src="/', 'src="https://mediahub.seoul.go.kr/')
    time.sleep(1)
    # Save the cleaned article HTML.
    # NOTE(review): assumes ./save_html and ./seoul_images already exist —
    # consider os.makedirs(..., exist_ok=True) up front.
    with open(f"./save_html/html_save_{site_id}_{idx}.html", "wb") as file:
        file.write(elem.encode('utf-8'))
    time.sleep(1)
    soup = BeautifulSoup(elem, "html.parser")
    formatted_data = pprint.pformat(soup, width=40, indent=4)
    processed_html.append(formatted_data)
    images = soup.select("img")
    downloaded_images = set()  # URLs already fetched for this article
    for img_num, image in enumerate(images):
        src = image["src"]  # renamed: the original shadowed `image` with the URL
        if src in downloaded_images:
            continue
        downloaded_images.add(src)
        # BUGFIX: the original called requests.get(src) twice back-to-back,
        # downloading every image twice and discarding the first response.
        image_content = requests.get(src).content
        file_path = f"./seoul_images/{site_id}_{img_num}.png"
        with open(file_path, "wb") as image_data:
            image_data.write(image_content)
        time.sleep(1)