# NOTE: original comments were Korean only (no English translation provided)
import winsound as sd
from bs4 import BeautifulSoup
import pyautogui
import pandas as pd
import re
def beepsound():
    """Play a short alert tone (used to signal crawl completion/failure)."""
    frequency_hz = 2000  # winsound.Beep accepts 37 .. 32767 Hz
    duration_ms = 1000   # 1000 ms == one second
    sd.Beep(frequency_hz, duration_ms)
def ytb_info(video_url, channel):
    """Crawl every YouTube video page in *video_url* for channel *channel*.

    For each video it collects metadata (date, title, view count, likes,
    dislikes, comment count) into one rolling CSV per channel
    (``data/<channel>/-<channel>.csv``), then scrolls the page to load all
    comments and writes one CSV of commenter IDs and comment texts per video
    (``data/<channel>/<title>.csv``), and finally extracts subtitles via
    ``ytb_subtitle``.

    Parameters
    ----------
    video_url : sequence of str
        Video page URLs to visit.
    channel : str
        Channel name; used only to build the ``data/<channel>/`` output paths.

    NOTE(review): relies on ``webdriver``, ``options``, ``Keys``, ``time`` and
    ``ytb_subtitle`` being imported/defined elsewhere in this file — they are
    not imported in the visible chunk; confirm the selenium imports exist.
    """
    print(f'{channel}', ' 크롤링 시작')
    driver = webdriver.Chrome('chromedriver.exe', options=options)
    count = 1  # the ad/overlay dismiss button should be clicked only once

    # Accumulators for the per-channel stats CSV.
    date_list = []
    title_list = []
    view_list = []
    like_list = []
    dislike_list = []
    comment_list = []

    # Filename sanitizer, compiled once instead of per video.
    bad_chars = re.compile('[-=+,#/\?:^$.@*\"※~&%ㆍ!』\\‘|\(\)\[\]\<\>`\'…《\》]')

    def _page_down(body, times=1):
        # Scroll one screen at a time so lazily rendered comments appear
        # (guards against an empty/null comment section).
        for _ in range(times):
            body.send_keys(Keys.PAGE_DOWN)
            time.sleep(0.5)

    try:
        for start_url in video_url:
            print(start_url, end=' / ')
            driver.get(start_url)
            driver.implicitly_wait(1.5)
            body = driver.find_element_by_tag_name('body')
            _page_down(body)
            driver.implicitly_wait(1)

            # ---- per-video metadata -----------------------------------
            try:
                info = driver.find_element_by_css_selector(
                    '.style-scope ytd-video-primary-info-renderer'
                ).text.split('\n')
                # Drop a leading "trending" badge or hashtag row if present.
                if '인기 급상승 동영상' in info[0]:
                    info.pop(0)
                elif '#' in info[0].split(' ')[0]:
                    info.pop(0)
                title = info[0]
                # info[1] looks like "조회수 1,234회 <date>" — split it apart.
                divide = info[1].replace('조회수 ', '').replace(',', '').split('회')
                view = divide[0]
                date = divide[1].replace(' ', '')
                like = info[2]
                dislike = info[3]
                try:
                    comment = driver.find_element_by_css_selector(
                        '#count > yt-formatted-string > span:nth-child(2)'
                    ).text.replace(',', '')
                except Exception:
                    comment = '댓글x'  # comments disabled / count not rendered
                date_list.append(date)
                title_list.append(title)
                view_list.append(view)
                like_list.append(like)
                dislike_list.append(dislike)
                comment_list.append(comment)
                # Rewrite the channel CSV after every video so a crash
                # mid-run still leaves partial results on disk.
                new_data = {'date': date_list, 'title': title_list,
                            'view': view_list, 'comment': comment_list,
                            'like': like_list, 'dislike': dislike_list}
                pd.DataFrame(new_data).to_csv(
                    f'data/{channel}/-{channel}.csv', encoding='utf-8-sig')
            except Exception:
                # Page layout did not match expectations — skip this video.
                continue

            _page_down(body)

            # ---- scroll to the bottom so every comment loads ----------
            last_page_height = driver.execute_script(
                "return document.documentElement.scrollHeight")
            while True:
                driver.execute_script(
                    "window.scrollTo(0, document.documentElement.scrollHeight);")
                time.sleep(0.5)
                new_page_height = driver.execute_script(
                    "return document.documentElement.scrollHeight")
                if new_page_height == last_page_height:
                    break
                last_page_height = new_page_height
                time.sleep(0.75)
            time.sleep(0.5)

            # ---- comment crawling -------------------------------------
            soup = BeautifulSoup(driver.page_source, 'lxml')
            users = soup.select("div#header-author > h3 > #author-text > span")
            comments = soup.select("yt-formatted-string#content-text")
            user_list = []
            review_list = []
            # zip() tolerates a user/comment count mismatch; the original
            # indexed comments[i] inside range(len(users)) and could raise
            # IndexError when the two selectors matched different counts.
            for user, comm in zip(users, comments):
                # The original chained two identical-looking .replace(' ','')
                # calls — one was presumably a non-breaking space; both are
                # kept so behavior is unchanged. TODO confirm against source.
                name = str(user.text).replace('\n', '').replace('\t', '')
                name = name.replace(' ', '').replace(' ', '')
                user_list.append(name)
                text = str(comm.text).replace('\n', '')
                text = text.replace('\t', '').replace(' ', '')
                review_list.append(text)
            title = bad_chars.sub('', title)  # make the title filesystem-safe
            pd.DataFrame({"ID": user_list, "Comment": review_list}).to_csv(
                f"data/{channel}/{title}.csv", encoding='utf-8-sig')
            print('ㅁ', end='')

            # Subtitle extraction for this video.
            ytb_subtitle(start_url, title)

            # ---- dismiss the one-time ad/consent overlay --------------
            if count:
                try:
                    driver.implicitly_wait(0.5)
                    driver.find_element_by_css_selector(
                        "#main > div > ytd-button-renderer").click()
                    count -= 1
                except Exception:
                    continue
    except Exception:
        # Unrecoverable failure — fall through to the shared cleanup below.
        # (The original quit/beeped in the handler AND again after it,
        # invoking the cleanup twice on the failure path.)
        pass
    finally:
        driver.quit()
        beepsound()