๐Ÿ‘ฉ‍๐Ÿ’ป ๋งํฌํƒ€๊ณ  ๋ธŒ๋žœ๋“œ ํƒ€์ดํ‹€ ํฌ๋กค๋ง

    ๋ฐ˜์‘ํ˜•

     

    import os
    import re
    import pandas as pd
    import pickle
    import collections
    import numpy as np
    import math
    from ast import literal_eval
    from time import gmtime, strftime
    import re
    import time
    from tqdm import tqdm
    from bs4 import BeautifulSoup
    
    # Scrapping
    import selenium
    from selenium import webdriver
    from selenium.webdriver.common.keys import Keys
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from webdriver_manager.chrome import ChromeDriverManager
    from selenium.webdriver.common.by import By
    from selenium.webdriver.chrome.options import Options
    from fake_useragent import UserAgent
    
    # Error Handling
    import socket
    import urllib3
    import urllib.request
    from urllib.request import urlopen
    from urllib.parse import quote_plus
    from urllib.request import urlretrieve
    from selenium.common.exceptions import NoSuchElementException, ElementNotInteractableException, ElementClickInterceptedException
    
    import warnings
    warnings.filterwarnings('ignore')

     

    우선 필요한 라이브러리 불러오기!
    기본적으로 pandas, BeautifulSoup, urllib.request, selenium.webdriver 정도 필요했다.

    # Load the table of brand page URLs. The crawl loop further down reads
    # them from the "brand_links" column.
    links = pd.read_csv(r"G:\내 드라이브\all_brand_links.csv")
    # Placeholder column; the crawl loop fills it in one row at a time.
    links['brand_title'] = ''

    ๋งฅ์—์„œ๋Š” ๊ฒฝ๋กœ ๋ณต์‚ฌํ•˜๋Š”๋ฐ์—๋„ ํ•œ์ฐธ ์• ๋จน์—ˆ๋‹ค,,,
    (๋งฅ ๊ฒฝ๋กœ๋ณต์‚ฌ ๋‹จ์ถ•ํ‚ค! : opt + command + c )
    ์จŒ๋“  ๋ณด๋‚ด์ฃผ์‹  ๋งํฌ๋ฅผ ์—ด์—ˆ๊ณ , ๋‚ด๊ฐ€ ํฌ๋กค๋ง ํ•  brand title์„ ๋„ฃ์„ column์„ ์ƒˆ๋กœ ๋งŒ๋“ค๊ณ  ๋นˆ๋ฆฌ์ŠคํŠธ๋ฅผ ๋„ฃ์—ˆ๋‹ค.

    ๊ทผ๋ฐ ์—ฌ๊ธฐ์„œ ๋ณดํ†ต์€ ์ด๋Ÿฐ ์‹์œผ๋กœ ํ•˜์ง€๋Š” ์•Š๋Š”๋‹ค๊ณ  ํ•˜์…จ๋‹ค. ์–ด์จŒ๋“  ''๋„ ๊ฐ’์ด๊ธฐ ๋•Œ๋ฌธ์— ๋‚˜์ค‘์— ์‚ฌ์†Œํ•˜๋”๋ผ๋„ ์ด์Šˆ๊ฐ€ ์ƒ๊ธธ ์ˆ˜ ์žˆ๊ธฐ ๋•Œ๋ฌธ์—,
    ๊ทธ๋ƒฅ ํฌ๋กค๋งํ•œ ๊ฐ’์„ list์— appendํ•˜๋ฉด์„œ ํ•˜๋‚˜์˜ list๋ฅผ ๋งŒ๋“  ๋‹ค์Œ์— dataFrame์— column์œผ๋กœ ๋ถ™์ด๋Š”๊ฒƒ์ด ๋งž๋‹ค๊ณ  ํ•˜์‹ ๋‹ค...!

    # Visit every brand page, click away the pop-up, and store the brand
    # title found on the page in links["brand_title"].
    # NOTE(review): the driver path is still passed positionally, as in the
    # original; newer Selenium releases may require a Service object instead
    # -- confirm against the installed Selenium version.
    # NOTE(review): the browser is left open after the loop, as before; call
    # wd.quit() once it is no longer needed.
    wd = webdriver.Chrome(ChromeDriverManager().install())

    # Absolute XPath of the pop-up button that has to be clicked on each page.
    POPUP_CLOSE_XPATH = "/html/body/div/div/div/div/div[1]/span/div/div[2]/div[2]/button[2]"

    error = []  # URLs that failed, kept so they can be re-checked later

    # `idx` is the row label of the URL inside `links`, so each title is
    # written to the row it came from; `done` counts processed rows (1-based).
    for done, (idx, url) in enumerate(links["brand_links"].items(), start=1):
        try:
            wd.get(url)
            time.sleep(3)  # fixed wait after loading the page

            # find_element(By.XPATH, ...) replaces find_element_by_xpath,
            # which newer Selenium versions no longer provide.
            wd.find_element(By.XPATH, POPUP_CLOSE_XPATH).click()
            time.sleep(3)  # fixed wait after the click

            soup = BeautifulSoup(wd.page_source, "html.parser")
            title_tag = soup.select_one('.details__brand__title')
            if title_tag is None:
                # Fail with a clear message instead of an AttributeError
                # raised by calling get_text() on None.
                raise LookupError("'.details__brand__title' element not found")

            # Single .loc assignment instead of chained indexing, which may
            # write to a temporary copy and leave `links` unchanged.
            links.loc[idx, "brand_title"] = title_tag.get_text()
            print(done, "개 완료")

        except Exception as e:
            # Best-effort crawl: report the failing URL, remember it, and
            # carry on with the next link.
            print(url, "에서 오류 발생: ", e)
            error.append(url)

    ์ผ๋‹จ xpath๋กœ ํŒ์—…์ด ๋œจ๋Š” ๊ฑธ ๊ณ„์† ๋‹ซ์•„์ค˜์•ผํ–ˆ๋‹ค.
    ๊ทธ๋ฆฌ๊ณ , BeautifulSoup์œผ๋กœ ๋ธŒ๋žœ๋“œ ๋ช…์€ ํ•˜๋‚˜์”ฉ๋งŒ ๊ฐ€์ ธ์˜ค๋ฉด ๋์ด์—ˆ๊ธฐ ๋•Œ๋ฌธ์— select_one() ํ•จ์ˆ˜๋ฅผ ์ด์šฉํ•˜์˜€๊ณ , text๋งŒ ๊ฐ€์ ธ์˜ค๋ฉด ๋˜๊ธฐ ๋•Œ๋ฌธ์— get_text() ์‚ฌ์šฉํ•˜์˜€๋‹ค.

    idx ๋ณ€์ˆ˜๋กœ ์ธ๋ฑ์Šค๋ฅผ ํ•˜๋‚˜ํ•˜๋‚˜ ๋Š˜๋ ค์ฃผ๋ฉด์„œ ์ˆœ์„œ๋Œ€๋กœ ๋ธŒ๋žœ๋“œ๋ฅผ ๋ถ™์—ฌ์คฌ๋Š”๋ฐ, ์‚ฌ์‹ค ์ด๊ฑด ๊ทธ๋ƒฅ ์ฐจ๋ก€๋Œ€๋กœ ํ•˜๋‚˜ํ•˜๋‚˜ ๋ถ™์—ฌ์ฃผ๋Š” ๊ฑฐ๊ธฐ ๋•Œ๋ฌธ์— ๋ถˆ์•ˆ๋ถˆ์•ˆํ•œ๊ฒŒ ์—†์ง€์•Š์•˜๋‹ค.

    ์ˆœ์„œ๋Œ€๋กœ ๋ฐ›์•„์˜ค๋ฉด์„œ error๊ฐ€ ๋‚˜๋ฉด ๊ทธ ์—๋Ÿฌ๋Š” ๋‚˜์ค‘์— ๋‹ค์‹œ ํ™•์ธํ•  ์ˆ˜ ์žˆ๊ฒŒ error ๋ฆฌ์ŠคํŠธ์— ์–ดํŽœ๋“œํ•ด์ค€ ๋’ค, passํ•œ๋‹ค.

    솔직히 책에서 하는거 제외하고 이렇게 크롤러만들어서 크롤링하는거 처음이었는데 나름,,, 잘했다고 칭찬도 받아서 뿌듯,,, (부끄럽지만 그래도 뿌듯했다고,,)

    ๋‚ด ์ปดํ“จํ„ฐ์—์„œ๋Š” ๋„ˆ๋ฌด ๋Š๋ ค์„œ ๋‹ค์‹œ ํ•ด๋ณผ ๋•Œ๋Š” 20๊ฐœ๋งŒ ํ…Œ์ŠคํŠธ๋กœ ๋Œ๋ ธ๋‹ค.
    ๋Œ๋ฆฌ๋ฉด ์ €๋ ‡๊ฒŒ ipynb์—์„œ ๋‚˜์˜จ ๊ฒƒ์ฒ˜๋Ÿผ ๋‚˜์˜จ๋‹ค.

    ๋ฐ˜์‘ํ˜•

    댓글