From b584f9a301aaffc7ddc926a057e55951eca5115a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20GUEZO?= Date: Mon, 16 Feb 2026 13:19:13 +0100 Subject: [PATCH] remplacement: changer le fichiers *main par scraper --- scraper.py | 363 ++++++++++++++++++++++++++++++++++++++++++++++++ test_scraper.py | 326 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 689 insertions(+) create mode 100755 scraper.py create mode 100644 test_scraper.py diff --git a/scraper.py b/scraper.py new file mode 100755 index 0000000..673bb4b --- /dev/null +++ b/scraper.py @@ -0,0 +1,363 @@ +#!/usr/bin/env python3 + +from sys import argv +from typing import cast +from requests import HTTPError, Response, Session +from bs4 import BeautifulSoup, Tag +from collections import OrderedDict +from json import JSONDecodeError, loads + + +class _ScraperData: + """_summary_""" + + def __init__(self, data: dict[str, object]) -> None: + """_summary_ + + Args: + data (dict[str, object]): _description_ + """ + self._data: dict[str, object] = data + + def _getcontent(self) -> dict[str, object] | None: + """_summary_ + + Returns: + dict[str, object]: _description_ + """ + current_data: dict[str, object] = self._data + for key in ["initialReduxState", "product", "content"]: + new_data: object | None = current_data.get(key) + if new_data is None: + return None + current_data: dict[str, object] = cast(dict[str, object], new_data) + + return current_data + + def _getattributes(self) -> dict[str, object] | None: + """_summary_ + + Returns: + dict[str, object]: _description_ + """ + current_data: object = self._getcontent() + if current_data is None: + return None + return cast(dict[str, object], current_data.get("attributes")) + + def prix(self) -> float | None: + """ + Retourne le prix unitaire d'une bouteille (75cl). + + Si aucun prix n'est disponible, retourne None. + """ + + content = self._getcontent() + if content is None: + return None + + items = content.get("items") + + # Vérifie que items existe et n'est pas vide + if not isinstance(items, list) or len(items) == 0: + return None + + prix_calcule: float | None = None + + for item in items: + if not isinstance(item, dict): + continue + + p = item.get("offerPrice") + attrs = item.get("attributes", {}) + + nbunit = attrs.get("nbunit", {}).get("value") + equivbtl = attrs.get("equivbtl", {}).get("value") + + if not isinstance(p, (int, float)) or not nbunit or not equivbtl: + continue + + nb = float(nbunit) + eq = float(equivbtl) + + if nb <= 0 or eq <= 0: + continue + + if nb == 1 and eq == 1: + return float(p) + + prix_calcule = round(float(p) / (nb * eq), 2) + + return prix_calcule + + def appellation(self) -> str | None: + """_summary_ + + Returns: + str: _description_ + """ + attrs: dict[str, object] | None = self._getattributes() + + if attrs is not None: + app_dict: object | None = attrs.get("appellation") + if isinstance(app_dict, dict): + return cast(str, app_dict.get("value")) + return None + + def _getcritiques(self, name: str) -> str | None: + """_summary_ + + Args: + name (str): _description_ + + Returns: + str | None: _description_ + """ + + current_value: dict[str, object] | None = self._getattributes() + if current_value is not None: + app_dict: dict[str, object] = cast( + dict[str, object], current_value.get(name) + ) + if not app_dict: + return None + + val = cast(str, app_dict.get("value")).rstrip("+").split("-") + if len(val) > 1 and val[1] != "": + val[0] = str(round((float(val[0]) + float(val[1])) / 2, 1)) + + return val[0] + return None + + def parker(self) -> str | None: + return self._getcritiques("note_rp") + + def robinson(self) -> str | None: + return self._getcritiques("note_jr") + + def suckling(self) -> str | None: + return self._getcritiques("note_js") + + def getdata(self) -> dict[str, object]: + return self._data + + def informations(self) -> str: + """ + Retourne toutes les informations sous la forme : + "Appelation,Parker,J.Robinson,J.Suckling,Prix" + """ + + appellation = self.appellation() + parker = self.parker() + robinson = self.robinson() + suckling = self.suckling() + try: + prix = self.prix() + except ValueError: + prix = None + + return f"{appellation},{parker},{robinson},{suckling},{prix}" + + +class Scraper: + """ + Scraper est une classe qui permet de gerer + de façon dynamique des requetes uniquement + sur le serveur https de Millesima + """ + + def __init__(self) -> None: + """ + Initialise la session de scraping. + """ + self._url: str = "https://www.millesima.fr/" + # Très utile pour éviter de renvoyer toujours les mêmes handshake + # TCP et d'avoir toujours une connexion constante avec le server + self._session: Session = Session() + # Système de cache pour éviter de solliciter le serveur inutilement + self._latest_request: tuple[(str, Response)] | None = None + self._latest_soups: OrderedDict[str, BeautifulSoup] = OrderedDict[ + str, BeautifulSoup + ]() + + def _request(self, subdir: str) -> Response: + """ + Effectue une requête GET sur le serveur Millesima. + + Args: + subdir (str): Le sous-répertoire ou chemin de l'URL (ex: "/vins"). + + Returns: + Response: L'objet réponse de la requête. + + Raise: + HTTPError: Si le serveur renvoie un code d'erreur (4xx, 5xx). + """ + target_url: str = self._url + subdir.lstrip("/") + response: Response = self._session.get(url=target_url, timeout=10) + response.raise_for_status() + return response + + def getresponse(self, subdir: str = "", use_cache: bool = True) -> Response: + """ + Récupère la réponse d'une page, en utilisant le cache si possible. + + Args: + subdir (str, optional): Le chemin de la page. + use_cache (bool, optional): Utilise la donnée deja sauvegarder ou + écrase la donnée utilisé avec la nouvelle + + Returns: + Response: L'objet réponse (cache ou nouvelle requête). + + Raise: + HTTPError: Si le serveur renvoie un code d'erreur (4xx, 5xx). + """ + + # si dans le cache, latest_request existe + if use_cache and self._latest_request is not None: + rq_subdir, rq_response = self._latest_request + + # si c'est la meme requete et que use_cache est true, + # on renvoie celle enregistrer + if subdir == rq_subdir: + return rq_response + + request: Response = self._request(subdir) + # on recrée la structure pour le systeme de cache si activer + if use_cache: + self._latest_request = (subdir, request) + + return request + + def getsoup(self, subdir: str, use_cache: bool = True) -> BeautifulSoup: + """ + Récupère le contenu HTML d'une page et le transforme en objet BeautifulSoup. + + Args: + subdir (str, optional): Le chemin de la page. + + Returns: + BeautifulSoup: L'objet parsé pour extraction de données. + + Raise: + HTTPError: Si le serveur renvoie un code d'erreur (4xx, 5xx). + """ + + if use_cache and subdir in self._latest_soups: + return self._latest_soups[subdir] + + markup: str = self.getresponse(subdir).text + soup: BeautifulSoup = BeautifulSoup(markup, features="html.parser") + + if use_cache: + self._latest_soups[subdir] = soup + + if len(self._latest_soups) > 10: + _ = self._latest_soups.popitem(last=False) + + return soup + + def getjsondata(self, subdir: str, id: str = "__NEXT_DATA__") -> _ScraperData: + """ + Extrait les données JSON contenues dans la balise __NEXT_DATA__ du site. + Beaucoup de sites modernes (Next.js) stockent leur état initial dans + une balise + + + """, + ) + + m.get( + "https://www.millesima.fr/poubelle", + text=f""" + + +

POUBELLE

+ + + + """, + ) + + json_data = { + "props": { + "pageProps": { + "initialReduxState": { + "product": { + "content": { + "_id": "J4131/22-11652", + "partnumber": "J4131/22", + "productName": "Nino Negri : 5 Stelle Sfursat 2022", + "productNameForSearch": "Nino Negri : 5 Stelle Sfursat 2022", + "storeId": "11652", + "seoKeyword": "nino-negri-5-stelle-sfursat-2022.html", + "title": "Nino Negri : 5 Stelle Sfursat 2022", + "items": [ + { + "_id": "J4131/22/C/CC/6-11652", + "partnumber": "J4131/22/C/CC/6", + "taxRate": "H", + "listPrice": 842, + "offerPrice": 842, + "seoKeyword": "vin-de-charazade1867.html", + "shortdesc": "Une bouteille du meilleur vin du monde?", + "attributes": { + "promotion_o_n": { + "valueId": "0", + "name": "En promotion", + "value": "Non", + "sequence": 80, + "displayable": "False", + "type": "CHECKBOX", + "isSpirit": False, + }, + "in_stock": { + "valueId": "L", + "name": "En stock", + "value": "Livrable", + "sequence": 65, + "displayable": "true", + "type": "CHECKBOX", + "isSpirit": False, + }, + "equivbtl": { + "valueId": "1", + "name": "equivbtl", + "value": "1", + "isSpirit": False, + }, + "nbunit": { + "valueId": "1", + "name": "nbunit", + "value": "1", + "isSpirit": False, + }, + }, + "stock": 12, + "availability": "2026-02-05", + "isCustomizable": False, + "gtin_cond": "", + "gtin_unit": "", + "stockOrigin": "EUR", + "isPrevSale": False, + } + ], + "attributes": { + "appellation": { + "valueId": "433", + "name": "Appellation", + "value": "Madame-Loïk", + "url": "Madame-loik.html", + "isSpirit": False, + "groupIdentifier": "appellation_433", + }, + "note_rp": { + "valueId": "91", + "name": "Peter Parker", + "value": "91", + "isSpirit": False, + }, + "note_jr": { + "valueId": "17+", + "name": "J. Robinson", + "value": "17+", + "isSpirit": False, + }, + "note_js": { + "valueId": "93-94.5", + "name": "J. cherazade", + "value": "93-94", + "isSpirit": False, + }, + }, + } + } + } + } + } + } + + html_product = f""" + + +

MILLESIMA

+ + + + """ + m.get( + "https://www.millesima.fr/nino-negri-5-stelle-sfursat-2022.html", + text=html_product, + ) + + html_product = f""" + + +

MILLESIMA

+ + + + """ + + list_pleine = f""" + + +

LE WINE

+ + + + """ + + list_vide = f""" + + +

LE WINE

+ + + + """ + + m.get( + "https://www.millesima.fr/wine.html", + complete_qs=False, + response_list=[ + {"text": list_pleine}, + {"text": list_vide}, + ], + ) + + # on return m sans fermer le server qui simule la page + yield m + + +@pytest.fixture +def scraper() -> Scraper: + return Scraper() + + +def test_soup(scraper: Scraper): + vide = scraper.getsoup("") + poubelle = scraper.getsoup("poubelle") + contenu = scraper.getsoup("nino-negri-5-stelle-sfursat-2022.html") + assert vide.find("h1") is None + assert str(poubelle.find("h1")) == "

POUBELLE

" + assert str(contenu.find("h1")) == "

MILLESIMA

" + + +def test_appellation(scraper: Scraper): + vide = scraper.getjsondata("") + poubelle = scraper.getjsondata("poubelle") + contenu = scraper.getjsondata("nino-negri-5-stelle-sfursat-2022.html") + assert vide.appellation() is None + assert poubelle.appellation() is None + assert contenu.appellation() == "Madame-Loïk" + + +def test_fonctionprivee(scraper: Scraper): + vide = scraper.getjsondata("") + poubelle = scraper.getjsondata("poubelle") + contenu = scraper.getjsondata("nino-negri-5-stelle-sfursat-2022.html") + assert vide._getattributes() is not None + assert vide._getattributes() == {} + assert vide._getcontent() is not None + assert vide._getcontent() == {"items": [], "attributes": {}} + assert poubelle._getattributes() is None + assert poubelle._getcontent() is None + assert contenu._getcontent() is not None + assert contenu._getattributes() is not None + + +def test_critiques(scraper: Scraper): + vide = scraper.getjsondata("") + poubelle = scraper.getjsondata("poubelle") + contenu = scraper.getjsondata("nino-negri-5-stelle-sfursat-2022.html") + assert vide.parker() is None + assert vide.robinson() is None + assert vide.suckling() is None + assert vide._getcritiques("test_ts") is None + assert poubelle.parker() is None + assert poubelle.robinson() is None + assert poubelle.suckling() is None + assert poubelle._getcritiques("test_ts") is None + assert contenu.parker() == "91" + assert contenu.robinson() == "17" + assert contenu.suckling() == "93.5" + assert contenu._getcritiques("test_ts") is None + + +def test_prix(scraper: Scraper): + vide = scraper.getjsondata("") + poubelle = scraper.getjsondata("poubelle") + contenu = scraper.getjsondata("nino-negri-5-stelle-sfursat-2022.html") + assert vide.prix() is None + assert poubelle.prix() is None + assert contenu.prix() == 842.0 + + +def test_informations(scraper: Scraper): + contenu = scraper.getjsondata("nino-negri-5-stelle-sfursat-2022.html") + assert contenu.informations() == "Madame-Loïk,91,17,93.5,842.0" + vide = scraper.getjsondata("") + poubelle = scraper.getjsondata("poubelle") + assert vide.informations() == "None,None,None,None,None" + assert poubelle.informations() == "None,None,None,None,None" + + +def test_search(scraper: Scraper): + m = mock_open() + with patch("builtins.open", m): + scraper.getvins("wine.html", "fake_file.csv") + + assert m().write.called + all_writes = "".join(call.args[0] for call in m().write.call_args_list) + assert "Madame-Loïk,91,17,93.5,842.0" in all_writes