Python

방탄소년단, 블랙핑크 멤버별 사진 다운로드 프로그램 => Crawling BTS & Black Pink members' pictures (with Python, Selenium)

EasyCoding 2021. 1. 4. 21:36
728x90

1. pip install selenium

2. Download "chromedriver.exe" (the version matching your installed Chrome) and copy it into a folder on your PATH

3. Run the code below: python kpop_crawling.py

 

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import urllib.request
import os

# Maps each group name to the list of member names to search for.
# The script creates one directory per group and one sub-directory per member.
kpop_dict = {
    "BTS": [
        "RM",
        "Jin",
        "Suga",
        "J-Hope",
        "Jimin",
        "V",
        "Jungkook",
    ],
    "Black Pink": [
        "Jisoo",
        "Jennie",
        "Rosé",
        "Lisa",
    ],
}

def crawling(target_name, number_of_pictures=50, scroll_pause_time=3):
    """Search Google Images for *target_name* and save the results locally.

    Uses the module-level ``driver`` (a Selenium WebDriver). Pictures are
    written to the current working directory as ``<target_name><index>.jpg``,
    with the index starting at 0.

    Args:
        target_name: Search query; also used as the file-name prefix.
        number_of_pictures: Maximum number of pictures to save.
        scroll_pause_time: Seconds to wait after each scroll so that more
            results can load. Increase it on a slow network.

    Returns:
        The number of pictures actually saved (may be lower than
        ``number_of_pictures`` when the result page runs out).
    """
    # Local import: ``By`` ships with the selenium package this file already
    # uses. The ``find_element(By.…)`` form works on Selenium 3 and 4, whereas
    # the ``find_element_by_*`` helpers were removed in Selenium 4.3.
    from selenium.webdriver.common.by import By

    driver.get("https://www.google.co.kr/imghp?hl=ko&tab=wi&ogbl")
    search_box = driver.find_element(By.NAME, "q")
    search_box.send_keys(target_name)
    search_box.send_keys(Keys.RETURN)

    last_height = driver.execute_script("return document.body.scrollHeight")

    count = 0          # pictures saved so far
    processed = 0      # thumbnails already visited, so none is handled twice
    exhausted = False  # True once the page can no longer load more results

    while count < number_of_pictures and not exhausted:
        # Scroll to the bottom and give the page time to load more results.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(scroll_pause_time)

        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            # The page stopped growing. ".mye4qd" is presumably the
            # "show more results" button (verify against the current Google
            # markup). If it cannot be clicked there is nothing more to load,
            # so handle the thumbnails already present and then stop.
            try:
                driver.find_element(By.CSS_SELECTOR, ".mye4qd").click()
            except Exception:
                exhausted = True
        last_height = new_height

        images = driver.find_elements(By.CSS_SELECTOR, ".rg_i.Q4LuWd")
        new_images = images[processed:]
        processed = max(processed, len(images))

        for image in new_images:
            if count >= number_of_pictures:
                break
            try:
                image.click()
                time.sleep(2)  # wait for the preview image to appear
                img_url = driver.find_element(
                    By.XPATH,
                    '/html/body/div[2]/c-wiz/div[3]/div[2]/div[3]/div/div/div[3]/div[2]/c-wiz/div[1]/div[1]/div/div[2]/a/img',
                ).get_attribute("src")
                urllib.request.urlretrieve(img_url, target_name + str(count) + ".jpg")
            except Exception as err:
                # Best effort: one broken thumbnail must not abort the crawl,
                # but Ctrl-C (KeyboardInterrupt) still propagates.
                print("Skipped a thumbnail for {}: {!r}".format(target_name, err))
                continue
            count += 1

    return count

    

# One Chrome session is shared by every crawling() call; crawling() reads the
# module-level name `driver`.
driver = webdriver.Chrome()
base_dir = os.getcwd()
try:
    for group, members in kpop_dict.items():
        group_dir = os.path.join(base_dir, group)
        # exist_ok=True lets the script be re-run without FileExistsError.
        os.makedirs(group_dir, exist_ok=True)
        for member in members:
            member_dir = os.path.join(group_dir, member)
            os.makedirs(member_dir, exist_ok=True)
            # crawling() saves files relative to the current working
            # directory, so switch into <group>/<member> before calling it.
            os.chdir(member_dir)
            crawling(member)
finally:
    # Always restore the starting directory and shut the browser down, even
    # when a crawl fails. quit() ends the whole WebDriver session, whereas
    # close() only closes the current window.
    os.chdir(base_dir)
    driver.quit()