콘솔워크

selenium으로 network 탭에서 호출된 특정 url 의 데이터 가져오기 본문

프로그래밍/python

Selenium으로 Network 탭에서 호출된 특정 URL의 응답 데이터 가져오기

콘솔워크 2023. 12. 11. 17:23
반응형

전체 코드는 다음과 같다.

# Always-true guard: the two imports and the sys.path tweak are kept inside an
# ``if`` block (presumably so import sorters/formatters do not hoist or
# reorder them -- TODO confirm).
if 1 == 1:
    import sys
    import os

    # Append the parent of this file's directory to the module search path;
    # presumably so that ``common.utils`` (imported below) resolves -- verify.
    sys.path.append(os.path.dirname(os.path.abspath(os.path.dirname(__file__))))


import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.options import Options
import time
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
import os
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.select import Select
from selenium.webdriver import DesiredCapabilities
import json
import urllib.parse
from common.utils import *


class PartKeywordsFeature:
    """Read related search terms for a Naver Shopping query from Chrome's network log.

    A headless Chrome session is started with performance logging enabled, so
    the responses the search page receives can be looked up afterwards through
    the Chrome DevTools Protocol (``Network.getResponseBody``).
    """

    def __init__(self):
        """Start headless Chrome with performance (network) logging turned on."""
        # Chrome options; headless means no browser window is shown.
        chrome_options = Options()
        for flag in (
            "--headless",
            "--disable-gpu",
            "--disable-software-rasterizer",
            "--disable-dev-shm-usage",
            "--disable-browser-side-navigation",
            "--disable-infobars",
            "--disable-notifications",
            "--disable-extensions",
            "--disable-default-apps",
            "--disable-offer-store-unmasked-wallet-cards",
            "--disable-offer-upload-credit-cards",
            "--disable-precise-memory-info",
            "--disable-universal-acceleration",
            "--disable-canvas-aa-msaa",
            "--disable-composited-antialiasing",
            "--disable-composited-antialiasing-webgl",
            "--disable-3d-apis",
        ):
            chrome_options.add_argument(flag)

        # Ask ChromeDriver to record every performance-log entry.  The
        # preference is set on the options object because current Selenium 4
        # releases no longer accept ``desired_capabilities``; this also avoids
        # mutating the shared ``DesiredCapabilities.CHROME`` dictionary.
        chrome_options.set_capability("goog:loggingPrefs", {"performance": "ALL"})

        self.driver = webdriver.Chrome(
            service=Service(ChromeDriverManager().install()),
            options=chrome_options,
        )

        self.default_wait = 5
        self.driver.implicitly_wait(self.default_wait)

    def __del__(self):
        """Quit the browser, if one was ever started for this instance."""
        # ``driver`` is missing when ``__init__`` failed before assigning it.
        driver = getattr(self, "driver", None)
        if driver is not None:
            driver.quit()

    def process_browser_log_entry(self, entry):
        """Return the inner ``message`` object of one raw performance-log entry.

        Args:
            entry: A log entry whose ``"message"`` value is a JSON string
                containing a top-level ``"message"`` key.

        Returns:
            The decoded value stored under that inner ``"message"`` key.
        """
        response = json.loads(entry["message"])["message"]
        return response

    def get_part_keywords(self, search_keyword: str, search_type: str):
        """Search Naver Shopping and return the response's terms as adjacent pairs.

        Args:
            search_keyword: Keyword to search for; it is URL-quoted before use.
            search_type: Visible text of the filter link to click.  When it is
                ``"네이버페이"`` a second toggle link is clicked as well.

        Returns:
            A list of overlapping two-item slices of consecutive entries from
            ``shoppingResult.terms`` of the first JSON response whose URL
            contains ``"all?adQuery"``.  The list is empty when no such
            response was logged or when fewer than two terms were returned.
        """
        terms_group = []
        driver = self.driver
        search_keyword = urllib.parse.quote(search_keyword)
        search_query = encode_url("https://search.shopping.naver.com/search/all?query=" + search_keyword)
        search_query = search_query.replace(" ", "%20")
        print(search_query)

        driver.get(search_query)
        time.sleep(2)

        # Click the requested filter.  This step is best effort: a failure is
        # logged and the log scan below still runs.
        try:
            naver_pay_filter_button = driver.find_element(
                By.XPATH,
                f'//a[contains(text(), "{search_type}")][contains(@class, "filter")]',
            )
            # Click through JavaScript rather than WebElement.click().
            driver.execute_script("arguments[0].click();", naver_pay_filter_button)

            time.sleep(1)

            if search_type == "네이버페이":
                only_naverpay_product_on_button = driver.find_element(
                    By.CSS_SELECTOR,
                    'a[data-nclick="N=a:opy.npaypluson"]',
                )
                driver.execute_script("arguments[0].click();", only_naverpay_product_on_button)
        except Exception as e:
            global_log_append(f"네이버페이, 가격비교 선택실패 {search_keyword}")
            global_log_append(e)

        # Fixed pause before reading the log so pending requests can finish.
        time.sleep(3)

        # Each raw entry carries the DevTools event as a JSON string.
        logs_raw = driver.get_log("performance")
        logs = [json.loads(lr["message"])["message"] for lr in logs_raw]

        def log_filter(log_):
            return (
                # is an actual response
                log_["method"] == "Network.responseReceived"
                # and json
                and "json" in log_["params"]["response"]["mimeType"]
            )

        for log in filter(log_filter, logs):
            request_id = log["params"]["requestId"]
            resp_url = log["params"]["response"]["url"]

            if "all?adQuery" not in str(resp_url):
                continue

            # Fetch the body of the matching response by its request id.
            response_body = driver.execute_cdp_cmd("Network.getResponseBody", {"requestId": request_id})
            response_body = json.loads(response_body["body"])
            terms = response_body["shoppingResult"]["terms"]

            if terms:
                # Sliding window of width two over consecutive terms.
                terms_group = [terms[i : i + 2] for i in range(len(terms) - 1)]
            # Only the first matching response is used.
            break

        return terms_group

    # 네트워크 쿼리 PreviewalladQuery shoping Result


if __name__ == "__main__":
    # Manual run: search one sample keyword with the Naver Pay filter and
    # print the term pairs that come back.
    part_keywords = PartKeywordsFeature().get_part_keywords(
        "편안한잠옷하기스",
        "네이버페이",
    )
    print(part_keywords)

 

 

코드해석

 

driver를 만들 때 performance 로그를 수집하도록 설정하는 부분이 제일 중요하다.

        # Chrome WebDriver setup: request that every performance-log entry be
        # collected, under both the legacy key and the vendor-prefixed key.
        # NOTE(review): this aliases DesiredCapabilities.CHROME instead of
        # copying it, so the two keys are added to that shared dictionary.
        capabilities = DesiredCapabilities.CHROME
        capabilities["loggingPrefs"] = {"performance": "ALL"}  # newer: goog:loggingPrefs
        capabilities["goog:loggingPrefs"] = {"performance": "ALL"}

        # NOTE(review): recent Selenium 4 releases reportedly no longer accept
        # ``desired_capabilities`` -- confirm against the installed version.
        self.driver = webdriver.Chrome(
            service=Service(ChromeDriverManager().install()), options=chrome_options, desired_capabilities=capabilities
        )

 

 

 

driver에서 get_log 함수를 통해 performance에 해당하는 모든 로그들을 가져온다.

그리고 내가 가져오고 싶은 응답은 URL에 "all?adQuery"가 포함된 요청이므로, 해당 URL의 응답만 골라 본문을 가져온다.


        # extract requests from logs
        # Each raw entry's "message" is a JSON string; its inner "message"
        # object is what the filter below inspects.
        logs_raw = driver.get_log("performance")
        logs = [json.loads(lr["message"])["message"] for lr in logs_raw]

        def log_filter(log_):
            # Keep only Network.responseReceived events with a JSON mime type.
            return (
                # is an actual response
                log_["method"] == "Network.responseReceived"
                # and json
                and "json" in log_["params"]["response"]["mimeType"]
            )

        terms = []
        for log in filter(log_filter, logs):
            request_id = log["params"]["requestId"]
            resp_url = log["params"]["response"]["url"]

            # Skip every response whose URL does not contain "all?adQuery".
            if str(resp_url).find("all?adQuery") == -1:
                continue

            # Fetch the response body by request id and decode it as JSON.
            response_body = driver.execute_cdp_cmd("Network.getResponseBody", {"requestId": request_id})
            response_body = json.loads(response_body["body"])
            terms = response_body["shoppingResult"]["terms"]

 

반응형