From 829c303e7847c03edf82334642fd4a38c678b7b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20GUEZO?= Date: Mon, 16 Feb 2026 13:36:17 +0100 Subject: [PATCH] ajout: debut question 8 --- main.py | 366 ++--------------------------------------------- requirements.txt | 11 +- test_main.py | 326 ----------------------------------------- 3 files changed, 17 insertions(+), 686 deletions(-) mode change 100644 => 100755 main.py diff --git a/main.py b/main.py old mode 100644 new mode 100755 index 6549eab..f16edc5 --- a/main.py +++ b/main.py @@ -1,358 +1,14 @@ -from typing import cast -from requests import HTTPError, Response, Session -from bs4 import BeautifulSoup, Tag -from collections import OrderedDict -from json import JSONDecodeError, loads - - -class _ScraperData: - """_summary_ - """ - def __init__(self, data: dict[str, object]) -> None: - """_summary_ - - Args: - data (dict[str, object]): _description_ - """ - self._data: dict[str, object] = data - - def _getcontent(self) -> dict[str, object] | None: - """_summary_ - - Returns: - dict[str, object]: _description_ - """ - current_data: dict[str, object] = self._data - for key in ["initialReduxState", "product", "content"]: - new_data: object | None = current_data.get(key) - if new_data is None: - return None - current_data: dict[str, object] = cast(dict[str, object], new_data) - - return current_data - - def _getattributes(self) -> dict[str, object] | None: - """_summary_ - - Returns: - dict[str, object]: _description_ - """ - current_data: object = self._getcontent() - if current_data is None: - return None - return cast(dict[str, object], current_data.get("attributes")) - - def prix(self) -> float | None: - """ - Retourne le prix unitaire d'une bouteille (75cl). - - Si aucun prix n'est disponible, retourne None. - """ - - content = self._getcontent() - if content is None: - return None - - items = content.get("items") - - # Vérifie que items existe et n'est pas vide - if not isinstance(items, list) or len(items) == 0: - return None - - prix_calcule: float | None = None - - for item in items: - if not isinstance(item, dict): - continue - - p = item.get("offerPrice") - attrs = item.get("attributes", {}) - - nbunit = attrs.get("nbunit", {}).get("value") - equivbtl = attrs.get("equivbtl", {}).get("value") - - if not isinstance(p, (int, float)) or not nbunit or not equivbtl: - continue - - nb = float(nbunit) - eq = float(equivbtl) - - if nb <= 0 or eq <= 0: - continue - - if nb == 1 and eq == 1: - return float(p) - - prix_calcule = round(float(p) / (nb * eq), 2) - - return prix_calcule - - def appellation(self) -> str | None: - """_summary_ - - Returns: - str: _description_ - """ - attrs: dict[str, object] | None = self._getattributes() - - if attrs is not None: - app_dict: object | None = attrs.get("appellation") - if isinstance(app_dict, dict): - return cast(str, app_dict.get("value")) - return None - - def _getcritiques(self, name: str) -> str | None: - """_summary_ - - Args: - name (str): _description_ - - Returns: - str | None: _description_ - """ - - current_value: dict[str, object] | None = self._getattributes() - if current_value is not None: - app_dict: dict[str, object] = cast( - dict[str, object], current_value.get(name) - ) - if not app_dict: - return None - - val = cast(str, app_dict.get("value")).rstrip("+").split("-") - if len(val) > 1 and val[1] != "": - val[0] = str(round((float(val[0]) + float(val[1])) / 2, 1)) - - return val[0] - return None - - def parker(self) -> str | None: - return self._getcritiques("note_rp") - - def robinson(self) -> str | None: - return self._getcritiques("note_jr") - - def suckling(self) -> str | None: - return self._getcritiques("note_js") - - def getdata(self) -> dict[str, object]: - return self._data - - def informations(self) -> str: - """ - Retourne toutes les informations sous la forme : - "Appelation,Parker,J.Robinson,J.Suckling,Prix" - """ - - appellation = self.appellation() - parker = self.parker() - robinson = self.robinson() - suckling = self.suckling() - try: - prix = self.prix() - except ValueError: - prix = None - - return f"{appellation},{parker},{robinson},{suckling},{prix}" - - -class Scraper: - """ - Scraper est une classe qui permet de gerer - de façon dynamique des requetes uniquement - sur le serveur https de Millesima - """ - - def __init__(self) -> None: - """ - Initialise la session de scraping. - """ - self._url: str = "https://www.millesima.fr/" - # Très utile pour éviter de renvoyer toujours les mêmes handshake - # TCP et d'avoir toujours une connexion constante avec le server - self._session: Session = Session() - # Système de cache pour éviter de solliciter le serveur inutilement - self._latest_request: tuple[(str, Response)] | None = None - self._latest_soups: OrderedDict[str, BeautifulSoup] = OrderedDict[ - str, BeautifulSoup - ]() - - def _request(self, subdir: str) -> Response: - """ - Effectue une requête GET sur le serveur Millesima. - - Args: - subdir (str): Le sous-répertoire ou chemin de l'URL (ex: "/vins"). - - Returns: - Response: L'objet réponse de la requête. - - Raise: - HTTPError: Si le serveur renvoie un code d'erreur (4xx, 5xx). - """ - target_url: str = self._url + subdir.lstrip("/") - response: Response = self._session.get(url=target_url, timeout=10) - response.raise_for_status() - return response - - def getresponse(self, subdir: str = "", use_cache: bool = True) -> Response: - """ - Récupère la réponse d'une page, en utilisant le cache si possible. - - Args: - subdir (str, optional): Le chemin de la page. - use_cache (bool, optional): Utilise la donnée deja sauvegarder ou - écrase la donnée utilisé avec la nouvelle - - Returns: - Response: L'objet réponse (cache ou nouvelle requête). - - Raise: - HTTPError: Si le serveur renvoie un code d'erreur (4xx, 5xx). - """ - - # si dans le cache, latest_request existe - if use_cache and self._latest_request is not None: - rq_subdir, rq_response = self._latest_request - - # si c'est la meme requete et que use_cache est true, - # on renvoie celle enregistrer - if subdir == rq_subdir: - return rq_response - - request: Response = self._request(subdir) - # on recrée la structure pour le systeme de cache si activer - if use_cache: - self._latest_request = (subdir, request) - - return request - - def getsoup(self, subdir: str, use_cache: bool = True) -> BeautifulSoup: - """ - Récupère le contenu HTML d'une page et le transforme en objet BeautifulSoup. - - Args: - subdir (str, optional): Le chemin de la page. - - Returns: - BeautifulSoup: L'objet parsé pour extraction de données. - - Raise: - HTTPError: Si le serveur renvoie un code d'erreur (4xx, 5xx). - """ - - if use_cache and subdir in self._latest_soups: - return self._latest_soups[subdir] - - markup: str = self.getresponse(subdir).text - soup: BeautifulSoup = BeautifulSoup(markup, features="html.parser") - - if use_cache: - self._latest_soups[subdir] = soup - - if len(self._latest_soups) > 10: - _ = self._latest_soups.popitem(last=False) - - return soup - - def getjsondata(self, subdir: str, id: str = "__NEXT_DATA__") -> _ScraperData: - """ - Extrait les données JSON contenues dans la balise __NEXT_DATA__ du site. - Beaucoup de sites modernes (Next.js) stockent leur état initial dans - une balise - - - """, - ) - - m.get( - "https://www.millesima.fr/poubelle", - text=f""" - - -

POUBELLE

- - - - """, - ) - - json_data = { - "props": { - "pageProps": { - "initialReduxState": { - "product": { - "content": { - "_id": "J4131/22-11652", - "partnumber": "J4131/22", - "productName": "Nino Negri : 5 Stelle Sfursat 2022", - "productNameForSearch": "Nino Negri : 5 Stelle Sfursat 2022", - "storeId": "11652", - "seoKeyword": "nino-negri-5-stelle-sfursat-2022.html", - "title": "Nino Negri : 5 Stelle Sfursat 2022", - "items": [ - { - "_id": "J4131/22/C/CC/6-11652", - "partnumber": "J4131/22/C/CC/6", - "taxRate": "H", - "listPrice": 842, - "offerPrice": 842, - "seoKeyword": "vin-de-charazade1867.html", - "shortdesc": "Une bouteille du meilleur vin du monde?", - "attributes": { - "promotion_o_n": { - "valueId": "0", - "name": "En promotion", - "value": "Non", - "sequence": 80, - "displayable": "False", - "type": "CHECKBOX", - "isSpirit": False, - }, - "in_stock": { - "valueId": "L", - "name": "En stock", - "value": "Livrable", - "sequence": 65, - "displayable": "true", - "type": "CHECKBOX", - "isSpirit": False, - }, - "equivbtl": { - "valueId": "1", - "name": "equivbtl", - "value": "1", - "isSpirit": False, - }, - "nbunit": { - "valueId": "1", - "name": "nbunit", - "value": "1", - "isSpirit": False, - }, - }, - "stock": 12, - "availability": "2026-02-05", - "isCustomizable": False, - "gtin_cond": "", - "gtin_unit": "", - "stockOrigin": "EUR", - "isPrevSale": False, - } - ], - "attributes": { - "appellation": { - "valueId": "433", - "name": "Appellation", - "value": "Madame-Loïk", - "url": "Madame-loik.html", - "isSpirit": False, - "groupIdentifier": "appellation_433", - }, - "note_rp": { - "valueId": "91", - "name": "Peter Parker", - "value": "91", - "isSpirit": False, - }, - "note_jr": { - "valueId": "17+", - "name": "J. Robinson", - "value": "17+", - "isSpirit": False, - }, - "note_js": { - "valueId": "93-94", - "name": "J. cherazade", - "value": "93-94", - "isSpirit": False, - }, - }, - } - } - } - } - } - } - - html_product = f""" - - -

MILLESIMA

- - - - """ - m.get( - "https://www.millesima.fr/nino-negri-5-stelle-sfursat-2022.html", - text=html_product, - ) - - html_product = f""" - - -

MILLESIMA

- - - - """ - - list_pleine = f""" - - -

LE WINE

- - - - """ - - list_vide = f""" - - -

LE WINE

- - - - """ - - m.get( - "https://www.millesima.fr/wine.html", - complete_qs=False, - response_list=[ - {"text": list_pleine}, - {"text": list_vide}, - ], - ) - - # on return m sans fermer le server qui simule la page - yield m - - -@pytest.fixture -def scraper() -> Scraper: - return Scraper() - - -def test_soup(scraper: Scraper): - vide = scraper.getsoup("") - poubelle = scraper.getsoup("poubelle") - contenu = scraper.getsoup("nino-negri-5-stelle-sfursat-2022.html") - assert vide.find("h1") is None - assert str(poubelle.find("h1")) == "

POUBELLE

" - assert str(contenu.find("h1")) == "

MILLESIMA

" - - -def test_appellation(scraper: Scraper): - vide = scraper.getjsondata("") - poubelle = scraper.getjsondata("poubelle") - contenu = scraper.getjsondata("nino-negri-5-stelle-sfursat-2022.html") - assert vide.appellation() is None - assert poubelle.appellation() is None - assert contenu.appellation() == "Madame-Loïk" - - -def test_fonctionprivee(scraper: Scraper): - vide = scraper.getjsondata("") - poubelle = scraper.getjsondata("poubelle") - contenu = scraper.getjsondata("nino-negri-5-stelle-sfursat-2022.html") - assert vide._getattributes() is not None - assert vide._getattributes() == {} - assert vide._getcontent() is not None - assert vide._getcontent() == {"items": [], "attributes": {}} - assert poubelle._getattributes() is None - assert poubelle._getcontent() is None - assert contenu._getcontent() is not None - assert contenu._getattributes() is not None - - -def test_critiques(scraper: Scraper): - vide = scraper.getjsondata("") - poubelle = scraper.getjsondata("poubelle") - contenu = scraper.getjsondata("nino-negri-5-stelle-sfursat-2022.html") - assert vide.parker() is None - assert vide.robinson() is None - assert vide.suckling() is None - assert vide._getcritiques("test_ts") is None - assert poubelle.parker() is None - assert poubelle.robinson() is None - assert poubelle.suckling() is None - assert poubelle._getcritiques("test_ts") is None - assert contenu.parker() == "91" - assert contenu.robinson() == "17" - assert contenu.suckling() == "93.5" - assert contenu._getcritiques("test_ts") is None - - -def test_prix(scraper: Scraper): - vide = scraper.getjsondata("") - poubelle = scraper.getjsondata("poubelle") - contenu = scraper.getjsondata("nino-negri-5-stelle-sfursat-2022.html") - assert vide.prix() is None - assert poubelle.prix() is None - assert contenu.prix() == 842.0 - - -def test_informations(scraper: Scraper): - contenu = scraper.getjsondata("nino-negri-5-stelle-sfursat-2022.html") - assert contenu.informations() == "Madame-Loïk,91,17,93.5,842.0" - vide = scraper.getjsondata("") - poubelle = scraper.getjsondata("poubelle") - assert vide.informations() == "None,None,None,None,None" - assert poubelle.informations() == "None,None,None,None,None" - - -def test_search(scraper: Scraper): - m = mock_open() - with patch("builtins.open", m): - scraper.getvins("wine.html", "fake_file.csv") - - assert m().write.called - all_writes = "".join(call.args[0] for call in m().write.call_args_list) - assert "Madame-Loïk,91,17,93.5,842.0" in all_writes