diff --git a/main.py b/main.py index a2cd2e2..416489d 100644 --- a/main.py +++ b/main.py @@ -3,30 +3,6 @@ from typing import Any from bs4 import BeautifulSoup import json -url = "louis-latour-aloxe-corton-1er-cru-les-chaillots-2018.html" -# response = requests.get(url) -# soup = BeautifulSoup(response.text, 'html.parser') - - -class MillesimaSoup(BeautifulSoup): - def __init__(self, markup="", features="html.parser", *args, **kwargs): - super().__init__(markup, features, *args, **kwargs) - - self._json_data = self._extract_json_data() - - def _extract_json_data(self) -> dict[str, Any]: - script = self.find("script", id="__NEXT_DATA__") - - if script and script.string: - try: - data: dict[str, Any] = json.loads(script.string) - for element in ['props', 'pageProps', 'initialReduxState', 'product', 'content']: - data.get(element) - return data - except json.decoder.JSONDecodeError: - return {} - return {} - class Scraper: """ @@ -47,7 +23,7 @@ class Scraper: # TCP et d'avoir toujours une connexion constante avec le server self._session: requests.Session = requests.Session() self._url: str = "https://www.millesima.fr/" - self._soup = None + self._soup = self.getsoup() def _request(self, subdir: str, use_cache: bool = True) -> requests.Response | requests.HTTPError: """ @@ -71,11 +47,11 @@ class Scraper: if self._response.url == target_url: return self._response - self._response: requests.Response = self._session.get(target_url, timeout=10) + self._response: requests.Response = self._session.get( + target_url, timeout=10) self._response.raise_for_status() return self._response - def getsoup(self, subdir: str = "/") -> BeautifulSoup: """ @@ -85,25 +61,19 @@ class Scraper: :return: Description :rtype: BeautifulSoup """ - self._request(subdir) - self._soup = BeautifulSoup(self._response.text, "html.parser") + if subdir is not None: + self._request(subdir) + self._soup = BeautifulSoup(self._response.text, "html.parser") return self._soup - - - -print(Scraper().getsoup(url)) - 
-# # On cible la balise magique -# script_tag = soup.find('script', id='__NEXT_DATA__') -# print(script_tag) - -# if script_tag: -# # On transforme le texte en vrai dictionnaire Python -# data = json.loads(script_tag.string) -# # Navigation dans l'objet (Next.js structure toujours comme ça) -# product_info = data['props']['pageProps']['initialReduxState']['product']['content'] - -# print(f"Vin : {product_info['productName']}") -# print(f"Prix HT : {product_info['items'][0]['htPrice']} €") -# print(f"Stock : {product_info['items'][0]['stock']}") + def get_json_data(self): + script = self._soup.find("script", id="__NEXT_DATA__") + if script and script.string: + try: + data: dict[str, Any] = json.loads(script.string) + # NOTE(review): removed a no-op loop over the expected keys + # ('props'..'content') whose data.get() results were discarded + return data + except json.decoder.JSONDecodeError: + pass + return {} diff --git a/test_main.py b/test_main.py index 4af0b52..645bf07 100644 --- a/test_main.py +++ b/test_main.py @@ -1,4 +1,6 @@ from main import * +scraper = Scraper() + def test_soup(): - assert getsoup("https://example.com").find('h1').text == "Example Domain" + assert scraper.getsoup().find('h1').text[3:12] == "MILLESIMA"