Coding/Python

[python] ์œ ํŠœ๋ธŒ ๋Œ“๊ธ€ ํฌ๋กค๋ง(api ์—†์ด)

๊น€์œ ๋‹ˆ์ฝ˜ 2022. 4. 6. 21:12

์œ ํŠœ๋ธŒ ์˜์ƒ์„ ๊ฒ€์ƒ‰ํ•˜์—ฌ, 

๋‚˜์˜ค๋Š” ์˜์ƒ๋“ค์˜ ๋Œ“๊ธ€์„ ๋ชจ๋‘ ์ˆ˜์ง‘ํ•ด์•ผ ํ–ˆ๋‹ค. 

 

์—ฌ๋Ÿฌ ์ฐธ๊ณ  ์ž๋ฃŒ๋ฅผ ํ™œ์šฉํ•˜์—ฌ ์•„๋ž˜์™€ ๊ฐ™์€ ์ฝ”๋“œ๋ฅผ ์™„์„ฑ์‹œ์ผฐ๋‹ค. 

 

 

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from tqdm import tqdm
import time
import re
import requests
import random
import datetime
#Queue์˜ ๊ธฐ๋ณธ์ ์ธ ๊ธฐ๋Šฅ ๊ตฌํ˜„
# Basic FIFO queue used by the crawler to detect when page scrolling has settled.
class Queue():
    """Simple list-backed FIFO queue with a soft maximum size.

    The crawler uses it to count how many consecutive scroll-height samples
    were identical; once that count exceeds ``maxsize`` the page is assumed
    to be fully loaded.
    """

    def __init__(self, maxsize):
        # maxsize is advisory only: enqueue() never refuses data; callers
        # consult isMaxSizeOver() or keep their own counters.
        self.queue = []
        self.maxsize = maxsize

    def enqueue(self, data):
        """Append *data* to the tail of the queue."""
        self.queue.append(data)

    def dequeue(self):
        """Remove and return the oldest element, or None when empty."""
        if self.isEmpty():
            print("Queue is Empty")
            return None
        return self.queue.pop(0)

    def peek(self):
        """Return (without removing) the oldest element, or None when empty."""
        if self.isEmpty():
            print("Queue is Empty")
            return None
        return self.queue[0]

    def isEmpty(self):
        """Return True when the queue holds no elements."""
        return len(self.queue) == 0

    def isMaxSizeOver(self):
        """Return True when the queue has grown beyond its soft maximum.

        Bug fix: the original returned the *inverted* result (False when
        the size was over ``maxsize``, True otherwise).
        """
        return len(self.queue) > self.maxsize

# ํ‚ค์›Œ๋“œ๋ฅผ ๋„ฃ์œผ๋ฉด ๊ฒ€์ƒ‰๋˜๋Š” ์˜์ƒ์˜ ์ œ๋ชฉ, url, ์ •๋ณด๋ฅผ ์ˆ˜์ง‘ํ•˜์—ฌ csv๋กœ ์ €์žฅ
def get_urls_from_youtube_with_keyword(keyword):
    options = webdriver.ChromeOptions()
    options.add_argument('window-size=1920x1080')
    options.add_argument('disable-gpu')
    options.add_argument('user')
    options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36")
    options.add_argument("lang=ko_KR")
    driver = webdriver.Chrome(ChromeDriverManager().install(), chrome_options=options)
    
    key_word = keyword.replace(' ', '+')
    search_keyword_encode = requests.utils.quote(key_word)
    url = "https://www.youtube.com/results?search_query=" + search_keyword_encode
    driver.get(url)
    last_page_height = driver.execute_script("return document.documentElement.scrollHeight")

    #ํŽ˜์ด์ง€ Open ํ›„ ๊ธฐ๋‹ค๋ฆฌ๋Š” ์‹œ๊ฐ„
    time.sleep(5)

    #down the scroll
    body = driver.find_element_by_tag_name('body')
    last_page_height = driver.execute_script("return document.documentElement.scrollHeight")
    
    # max size 50์˜ Queue ์ƒ์„ฑ
    # 0.1sec * 50 = 5sec ๋™์•ˆ Scroll ์—…๋ฐ์ดํŠธ๊ฐ€ ์—†์œผ๋ฉด ์Šคํฌ๋กค ๋‚ด๋ฆฌ๊ธฐ ์ข…๋ฃŒ
    szQ = Queue(50)
    enqueue_count = 0
    
    while True:
        # Scroll ๋‚ด๋ฆฌ๊ธฐ
        driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
        
        # Scroll Height๋ฅผ ๊ฐ€์ ธ์˜ค๋Š” ์ฃผ๊ธฐ
        time.sleep(0.1)
        new_page_height = driver.execute_script("return document.documentElement.scrollHeight")
        
        # Queue๊ฐ€ ๊ฝ‰ ์ฐจ๋Š” ๊ฒฝ์šฐ ์Šคํฌ๋กค ๋‚ด๋ฆฌ๊ธฐ ์ข…๋ฃŒ
        if(enqueue_count > szQ.maxsize):
            break
        
        # ์ฒซ Loop ์ˆ˜ํ–‰ (Queue๊ฐ€ ๋น„์–ด์žˆ๋Š” ๊ฒฝ์šฐ) ์˜ˆ์™ธ ์ฒ˜๋ฆฌ
        if(szQ.isEmpty()) :
            szQ.enqueue(new_page_height)
            enqueue_count += 1
            
        # Queue์— ๊ฐ€์žฅ ๋จผ์ € ๋“ค์–ด์˜จ ๋ฐ์ดํ„ฐ์™€ ์ƒˆ๋กœ ์—…๋ฐ์ดํŠธ ๋œ Scroll Height๋ฅผ ๋น„๊ตํ•จ
        # ๊ฐ™์œผ๋ฉด ๊ทธ๋Œ€๋กœ Enqueue, ๋‹ค๋ฅด๋ฉด Queue์˜ ๋ชจ๋“  Data๋ฅผ Dequeue ํ›„ ์ƒˆ๋กœ์šด Scroll Height๋ฅผ Enqueue ํ•จ.    
        else :
            if(szQ.peek() == new_page_height) :
                szQ.enqueue(new_page_height)
                enqueue_count += 1
            else :
                szQ.enqueue(new_page_height)
                for z in range(enqueue_count) :
                    szQ.dequeue()
                enqueue_count = 1
    
    html_source = driver.page_source
    soup = BeautifulSoup(html_source, 'lxml')
    
    driver.close()

    datas = [a.get('href')  # aํƒœ๊ทธ - href ๊ฐ’ ๊ฐ€์ ธ์˜ค๊ธฐ
            for a in soup.select('a#video-title')  # aํƒœ๊ทธ - Id: video-title ๊ฐ€์ ธ์˜ค๊ธฐ
            if not 'shorts' in a.get('href')]  # 'shorts' ์ œ์™ธํ•˜๊ธฐ
    
    youtube_url ='https://www.youtube.com'
    url_lists = [youtube_url + a for a in datas]

    print("๊ด€๋ จ ์˜์ƒ์€ ์ด {0}๊ฐœ ์ž…๋‹ˆ๋‹ค.".format(len(datas)))
    
    page = 1
    data_list = []
    
    for page_url in tqdm(url_lists):
        
        if __name__=="__main__":
            #set option of selenium
            options = webdriver.ChromeOptions()
            options.add_argument('disable-gpu')
            options.add_argument('user')
            options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36")
            options.add_argument("lang=ko_KR")
            options.add_argument("headless") #์ฐฝ ์ˆจ๊ธฐ๊ธฐ ์ƒ์„ฑ 
            driver = webdriver.Chrome(ChromeDriverManager().install(), chrome_options=options)

            #target of crawling
            
            driver.get(page_url)
            driver.set_window_size(800, 600)

            #ํŽ˜์ด์ง€ Open ํ›„ ๊ธฐ๋‹ค๋ฆฌ๋Š” ์‹œ๊ฐ„
            driver.implicitly_wait(20)


            #down the scroll
            body = driver.find_element_by_tag_name('body')
            last_page_height = driver.execute_script("return document.documentElement.scrollHeight")

            # max size 50์˜ Queue ์ƒ์„ฑ
            # 0.1sec * 50 = 5sec ๋™์•ˆ Scroll ์—…๋ฐ์ดํŠธ๊ฐ€ ์—†์œผ๋ฉด ์Šคํฌ๋กค ๋‚ด๋ฆฌ๊ธฐ ์ข…๋ฃŒ
            szQ = Queue(50)
            enqueue_count = 0

            while True:
                # Scroll ๋‚ด๋ฆฌ๊ธฐ
                driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
                time.sleep(0.1)
                new_page_height = driver.execute_script("return document.documentElement.scrollHeight")

                if(enqueue_count > szQ.maxsize):
                    break

                if(szQ.isEmpty()) :
                    szQ.enqueue(new_page_height)
                    enqueue_count += 1
   
                else :
                    if(szQ.peek() == new_page_height) :
                        szQ.enqueue(new_page_height)
                        enqueue_count += 1
                    else :
                        szQ.enqueue(new_page_height)
                        for z in range(enqueue_count) :
                            szQ.dequeue()
                        enqueue_count = 1

            html0 = driver.page_source
            html = BeautifulSoup(html0, 'html.parser')

            yt_title = html.find('h1', {'class':'title style-scope ytd-video-primary-info-renderer'}).text
            uploader = html.find('yt-formatted-string', {'class': 'style-scope ytd-channel-name'}).text
            comments_list = html.findAll('ytd-comment-thread-renderer', {'class':'style-scope ytd-item-section-renderer'})

            print('{0}/{1}ํŽ˜์ด์ง€ ๋Œ“๊ธ€ ์ด {2}๊ฐœ ํฌ๋กค๋ง, url: {3}'.format(url_lists.index(page_url)+1,len(url_lists), len(comments_list), page_url))
            driver.close()
            
            for j in range(len(comments_list)):
                
                comment = comments_list[j].find('yt-formatted-string',{'id':'content-text'}).text
                comment = comment.replace('\n', '') 
                comment = comment.replace('\t', '')


                youtube_id = comments_list[j].find('a', {'id': 'author-text'}).span.text
                youtube_id = youtube_id.replace('\n', '') 
                youtube_id = youtube_id.replace('\t', '') 
                youtube_id = youtube_id.strip()

                raw_date = comments_list[j].find('yt-formatted-string', { 'class': 'published-time-text above-comment style-scope ytd-comment-renderer'})
                date = raw_date.a.text

                try:
                    like_num = comments_list[j].find('span', {'id': 'vote-count-middle'}).text
                    like_num = like_num.replace('\n', '') 
                    like_num = like_num.replace('\t', '')
                    like_num = like_num.strip()

                except: like_num = 0

                data = {'yt_title':yt_title,'youtube_id': youtube_id, 'uploader':uploader, 'comment': comment, 'date': date, 'like_num': like_num, 'page_url': page_url}
                data_list.append(data)
                
            page += 1 
            
        df = pd.DataFrame(data_list, columns=['yt_title','uploader','youtube_id','comment','date','like_num','page_url'])    
        
        csv_name = keyword + datetime.datetime.now().strftime('%Y%m%d') +'.csv'
        df.to_csv("./Excel_Data/"+csv_name, mode = 'w', index=False)
get_urls_from_youtube_with_keyword('๊ฒ€์ƒ‰ ํ‚ค์›Œ๋“œ')

 

 

์˜ˆ๋ฅผ ๋“ค์–ด, ๋“œ๋ผ๋งˆ ์‚ฌ๋‚ด๋งž์„ ์„ ์ž…๋ ฅํ•˜๋ฉด, 

์‚ฌ๋‚ด๋งž์„  ๊ฒ€์ƒ‰์‹œ ๋‚˜์˜ค๋Š” ์•ฝ 600๊ฐœ์˜ ์˜์ƒ์— ๋Œ€ํ•œ ๋ชจ๋“  ๋Œ“๊ธ€์„ ํฌ๋กค๋งํ•ด์ฃผ๋Š” ์ฝ”๋“œ์ด๋‹ค.