I want to search YouTube for a keyword and collect every comment on the videos that come up.
Pulling from several references, I put together the code below.
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from tqdm import tqdm
import time
import re
import requests
import random
import datetime
import os
# A minimal Queue implementation
class Queue():
    def __init__(self, maxsize):
        self.queue = []
        self.maxsize = maxsize

    # Put data into the queue
    def enqueue(self, data):
        self.queue.append(data)

    # Remove and return the oldest data in the queue
    def dequeue(self):
        dequeue_object = None
        if self.isEmpty():
            print("Queue is Empty")
        else:
            dequeue_object = self.queue[0]
            self.queue = self.queue[1:]
        return dequeue_object

    # Return (without removing) the oldest data in the queue
    def peek(self):
        peek_object = None
        if self.isEmpty():
            print("Queue is Empty")
        else:
            peek_object = self.queue[0]
        return peek_object

    # Check whether the queue is empty
    def isEmpty(self):
        return len(self.queue) == 0

    # Check whether the queue size exceeds maxsize
    def isMaxSizeOver(self):
        return len(self.queue) > self.maxsize
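# How the Queue is used below (illustrative sketch, not executed): it holds a
# window of recent scroll-height readings. Identical readings accumulate, and a
# changed reading flushes the window; once the window fills, the page is taken
# to be fully scrolled.
#   q = Queue(3)
#   q.enqueue(880); q.enqueue(880); q.enqueue(880)  # height unchanged 3 times
#   q.peek()  # -> 880, the oldest reading, compared against each new height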
# Given a keyword, collect the title, URL, and comment data of the videos
# returned by a YouTube search and save them to a CSV file
def get_urls_from_youtube_with_keyword(keyword):
    options = webdriver.ChromeOptions()
    options.add_argument('window-size=1920x1080')
    options.add_argument('disable-gpu')
    options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36")
    options.add_argument("lang=ko_KR")
    driver = webdriver.Chrome(ChromeDriverManager().install(), chrome_options=options)
    key_word = keyword.replace(' ', '+')
    search_keyword_encode = requests.utils.quote(key_word)
    url = "https://www.youtube.com/results?search_query=" + search_keyword_encode
    driver.get(url)
    # Wait for the page to load
    time.sleep(5)
    # Scroll down the page
    body = driver.find_element_by_tag_name('body')
    last_page_height = driver.execute_script("return document.documentElement.scrollHeight")
    # Create a Queue with max size 50:
    # if the scroll height does not change for 0.1 sec * 50 = 5 sec, stop scrolling
    szQ = Queue(50)
    enqueue_count = 0
    while True:
        # Scroll down
        driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
        # Interval between scroll-height readings
        time.sleep(0.1)
        new_page_height = driver.execute_script("return document.documentElement.scrollHeight")
        # Stop scrolling once the queue has filled up
        if enqueue_count > szQ.maxsize:
            break
        # First iteration (queue is empty): just enqueue
        if szQ.isEmpty():
            szQ.enqueue(new_page_height)
            enqueue_count += 1
        # Compare the oldest height in the queue with the newly read scroll height:
        # if equal, enqueue as-is; if different, dequeue everything in the queue
        # and then enqueue the new scroll height
        else:
            if szQ.peek() == new_page_height:
                szQ.enqueue(new_page_height)
                enqueue_count += 1
            else:
                szQ.enqueue(new_page_height)
                for z in range(enqueue_count):
                    szQ.dequeue()
                enqueue_count = 1
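    # The loop above boils down to "stop after ~50 consecutive unchanged height
    # readings"; a plain counter would do the same job (sketch, not executed):
    #   stable = 0
    #   while stable <= szQ.maxsize:
    #       # ...scroll, sleep(0.1), read new_page_height...
    #       stable = stable + 1 if new_page_height == last_page_height else 0
    #       last_page_height = new_page_height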
    html_source = driver.page_source
    soup = BeautifulSoup(html_source, 'lxml')
    driver.close()
    datas = [a.get('href')                          # take the href of each <a> tag
             for a in soup.select('a#video-title')  # <a> tags with id "video-title"
             if not 'shorts' in a.get('href')]      # exclude Shorts
    youtube_url = 'https://www.youtube.com'
    url_lists = [youtube_url + a for a in datas]
    print("Found {0} related videos in total.".format(len(datas)))
    page = 1
    data_list = []
    for page_url in tqdm(url_lists):
        # Set Selenium options for the per-video driver
        options = webdriver.ChromeOptions()
        options.add_argument('disable-gpu')
        options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36")
        options.add_argument("lang=ko_KR")
        options.add_argument("headless")  # run without opening a window
        driver = webdriver.Chrome(ChromeDriverManager().install(), chrome_options=options)
        # Target of the crawl
        driver.get(page_url)
        driver.set_window_size(800, 600)
        # Wait for the page to load
        driver.implicitly_wait(20)
        # Scroll down the page
        body = driver.find_element_by_tag_name('body')
        last_page_height = driver.execute_script("return document.documentElement.scrollHeight")
        # Create a Queue with max size 50:
        # if the scroll height does not change for 0.1 sec * 50 = 5 sec, stop scrolling
        szQ = Queue(50)
        enqueue_count = 0
        while True:
            # Scroll down
            driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
            time.sleep(0.1)
            new_page_height = driver.execute_script("return document.documentElement.scrollHeight")
            # Stop scrolling once the queue has filled up
            if enqueue_count > szQ.maxsize:
                break
            if szQ.isEmpty():
                szQ.enqueue(new_page_height)
                enqueue_count += 1
            else:
                if szQ.peek() == new_page_height:
                    szQ.enqueue(new_page_height)
                    enqueue_count += 1
                else:
                    szQ.enqueue(new_page_height)
                    for z in range(enqueue_count):
                        szQ.dequeue()
                    enqueue_count = 1
        html0 = driver.page_source
        html = BeautifulSoup(html0, 'html.parser')
        yt_title = html.find('h1', {'class': 'title style-scope ytd-video-primary-info-renderer'}).text
        uploader = html.find('yt-formatted-string', {'class': 'style-scope ytd-channel-name'}).text
        comments_list = html.findAll('ytd-comment-thread-renderer', {'class': 'style-scope ytd-item-section-renderer'})
        print('Page {0}/{1}: crawled {2} comments in total, url: {3}'.format(url_lists.index(page_url) + 1, len(url_lists), len(comments_list), page_url))
        driver.close()
        for j in range(len(comments_list)):
            comment = comments_list[j].find('yt-formatted-string', {'id': 'content-text'}).text
            comment = comment.replace('\n', '').replace('\t', '')
            youtube_id = comments_list[j].find('a', {'id': 'author-text'}).span.text
            youtube_id = youtube_id.replace('\n', '').replace('\t', '').strip()
            raw_date = comments_list[j].find('yt-formatted-string', {'class': 'published-time-text above-comment style-scope ytd-comment-renderer'})
            date = raw_date.a.text
            try:
                like_num = comments_list[j].find('span', {'id': 'vote-count-middle'}).text
                like_num = like_num.replace('\n', '').replace('\t', '').strip()
            except AttributeError:
                like_num = 0
            data = {'yt_title': yt_title, 'youtube_id': youtube_id, 'uploader': uploader, 'comment': comment, 'date': date, 'like_num': like_num, 'page_url': page_url}
            data_list.append(data)
        page += 1
    df = pd.DataFrame(data_list, columns=['yt_title', 'uploader', 'youtube_id', 'comment', 'date', 'like_num', 'page_url'])
    csv_name = keyword + datetime.datetime.now().strftime('%Y%m%d') + '.csv'
    os.makedirs("./Excel_Data", exist_ok=True)  # make sure the output folder exists
    df.to_csv("./Excel_Data/" + csv_name, mode='w', index=False)

if __name__ == "__main__":
    get_urls_from_youtube_with_keyword('search keyword')
For example, if you pass in the drama 사내맞선 (Business Proposal), the code crawls every comment on the roughly 600 videos that show up when searching for that title.
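For reference, here is a minimal sketch of loading the saved CSV back into pandas for a quick look (the file name is hypothetical; it just follows the keyword + date pattern used above):

import pandas as pd

df = pd.read_csv("./Excel_Data/사내맞선20220406.csv")  # hypothetical file name
print(df.shape)  # (number of comments collected, 7 columns)
print(df.groupby('yt_title')['comment'].count().sort_values(ascending=False).head())  # comments per video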