From e6c649b43313f954870cd80cb7ad7a74fd384b1c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20GUEZO?= <loicguezo@gmail.com>
Date: Mon, 2 Mar 2026 21:42:23 +0100
Subject: [PATCH] ajout: ajout factorisation vin et meilleure barre

---
 pyproject.toml        |  2 +-
 src/scraper.py        | 99 ++++++++++++++++++++++++++-----------------
 tests/test_scraper.py | 10 ++---
 3 files changed, 67 insertions(+), 44 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 3d9454d..6d14d59 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,7 +1,7 @@
 [project]
 name = "projet-millesima-s6"
 version = "0.1.0"
-dependencies = ["requests==2.32.5", "beautifulsoup4==4.14.3", "pandas==2.3.3"]
+dependencies = ["requests==2.32.5", "beautifulsoup4==4.14.3", "pandas==2.3.3", "tqdm==4.67.3"]
 
 [project.optional-dependencies]
 test = ["pytest==8.4.2", "requests-mock==1.12.1", "flake8==7.3.0"]
diff --git a/src/scraper.py b/src/scraper.py
index 4e0737e..5873e22 100755
--- a/src/scraper.py
+++ b/src/scraper.py
@@ -1,20 +1,23 @@
 #!/usr/bin/env python3
 
-from io import SEEK_END, SEEK_SET, BufferedWriter, BufferedReader
-from os.path import exists, normpath, realpath, dirname, join
-from os import makedirs
-from pickle import dump, load, UnpicklingError
-from sys import argv
-from typing import Any, Callable, Literal, cast
-from requests import HTTPError, Response, Session
-from bs4 import BeautifulSoup, Tag
 from collections import OrderedDict
+from io import SEEK_END, SEEK_SET, BufferedWriter
 from json import JSONDecodeError, loads
+from os import makedirs
+from os.path import dirname, exists, join, normpath, realpath
+from pickle import UnpicklingError, dump, load
+from sys import argv
+from tqdm.std import tqdm
+from typing import Any, Callable, Literal, TypeVar, cast
+from bs4 import BeautifulSoup, Tag
+from requests import HTTPError, Response, Session
 
 _dir: str = dirname(realpath(__name__))
 
+T = TypeVar("T")
 
-def _getcache[T](mode: Literal["rb", "wb"], fn: Callable[[Any], T]) -> T | None:
+
+def _getcache(mode: Literal["rb", "wb"], fn: Callable[[Any], T]) -> T | None:
     """_summary_
 
     Returns:
@@ -39,8 +42,10 @@ def savestate(data: tuple[int, set[str]]) -> None:
         _ = f.truncate()
         dump(data, f)
         f.flush()
+
     _getcache("wb", save)
 
+
 def loadstate() -> tuple[int, set[str]] | None:
     return _getcache("rb", lambda f: load(f))
 
@@ -147,7 +152,6 @@ class _ScraperData:
             str | None: Le nom (ex: 'Pauillac') ou None.
         """
         attrs: dict[str, object] | None = self._getattributes()
-
         if attrs is not None:
             app_dict: object | None = attrs.get("appellation")
             if isinstance(app_dict, dict):
@@ -365,7 +369,7 @@ class Scraper:
 
         return _ScraperData(cast(dict[str, object], current_data))
 
-    def _geturlproductslist(self, subdir: str) -> list[str] | None:
+    def _geturlproductslist(self, subdir: str) -> list[dict[str, Any]] | None:
         """
         Récupère la liste des produits d'une page de catégorie.
         """
@@ -373,32 +377,61 @@ class Scraper:
             data: dict[str, object] = self.getjsondata(subdir).getdata()
 
             for element in ["initialReduxState", "categ", "content"]:
-                data: dict[str, object] = cast(dict[str, object], data.get(element))
-                if not isinstance(data, dict):
-                    return None
+                data = cast(dict[str, object], data.get(element))
+                if not isinstance(data, dict):  # cast() is a runtime no-op
+                    return None
+
+            products: Any = data.get("products")
 
-            products: list[str] = cast(list[str], data.get("products"))
             if isinstance(products, list):
                 return products
 
         except (JSONDecodeError, HTTPError):
             return None
 
+    def _writevins(self, cache: set[str], product: dict[str, Any], f: Any) -> None:
+        """Fetch one product and append its informations() line to f.
+
+        Args:
+            cache (set[str]): seoKeywords already written; grows on success.
+            product (dict): Listing entry; ignored without a new seoKeyword.
+            f (Any): Open text file that receives one line per product.
+        """
+        if isinstance(product, dict):
+            link: Any | None = product.get("seoKeyword")
+            if link and link not in cache:
+                try:
+                    infos = self.getjsondata(link).informations()
+                    _ = f.write(infos + "\n")
+                    cache.add(link)  # only once the row is written
+                except (JSONDecodeError, HTTPError) as e:
+                    tqdm.write(f"Erreur sur le produit {link}: {e}")
+
     def getvins(self, subdir: str, filename: str, reset: bool = False) -> None:
         """
-        Scrape récursivement toutes les pages d'une catégorie et sauvegarde en CSV.
+        Scrape every page of a category and save the results as CSV.
 
         Args:
             subdir (str): La catégorie (ex: '/vins-rouges').
             filename (str): Nom du fichier de sortie (ex: 'vins.csv').
             reset (bool): (Optionnel) pour réinitialiser le processus.
         """
+        # File mode: truncate on reset, otherwise append to the existing file.
         mode: Literal["w", "a+"] = "w" if reset else "a+"
+        # Header (column titles) of the CSV.
         title: str = "Appellation,Robert,Robinson,Suckling,Prix\n"
+        # First page to request.
         page: int = 1
+        # seoKeywords already written, used to skip duplicates.
         cache: set[str] = set[str]()
 
+        custom_format = "{l_bar} {bar:20} {r_bar}"
+
         if not reset:
+            # Try to load the state saved by a previous run. If there is
+            # none, the defaults above are kept; otherwise the saved page
+            # and cache override them so the run resumes instead of
+            # starting the whole process again.
             serializable: tuple[int, set[str]] | None = loadstate()
             if isinstance(serializable, tuple):
                 page, cache = serializable
@@ -416,33 +449,23 @@ class Scraper:
                     _ = f.seek(0, SEEK_END)
 
                 while True:
-                    products_list: list[str] | None = self._geturlproductslist(
-                        f"{subdir}?page={page}"
+                    products_list: list[dict[str, Any]] | None = (
+                        self._geturlproductslist(f"{subdir}?page={page}")
                     )
-
                     if not products_list:
                         break
 
-                    products_list_length = len(products_list)
-                    for i, product in enumerate(products_list):
-                        if not isinstance(product, dict):
-                            continue
-
-                        link = product.get("seoKeyword")
-
-                        if link and link not in cache:
-                            try:
-                                infos = self.getjsondata(link).informations()
-                                _ = f.write(infos + "\n")
-                                print(
-                                    f"page: {page} | {i + 1}/{products_list_length} {link}"
-                                )
-                                cache.add(link)
-                            except (JSONDecodeError, HTTPError) as e:
-                                print(f"Erreur sur le produit {link}: {e}")
-                    f.flush()
+                    pbar = tqdm(products_list, bar_format=custom_format)
+                    for product in pbar:
+                        if not isinstance(product, dict):
+                            continue  # malformed entry: nothing to scrape
+                        keyword = str(product.get("seoKeyword") or "Inconnu")[:40]
+                        pbar.set_description(
+                            f"Page: {page:<3} | Product: {keyword:<40}"
+                        )
+                        self._writevins(cache, product, f)
                     page += 1
-        except:
+        except (Exception, KeyboardInterrupt):  # Ctrl-C must still save state
             if not reset:
                 savestate((page, cache))
 
diff --git a/tests/test_scraper.py b/tests/test_scraper.py
index 5bd6b43..fb4401a 100644
--- a/tests/test_scraper.py
+++ b/tests/test_scraper.py
@@ -153,7 +153,7 @@ def mock_site():
 
         html_product = f"""
         <html>
-            <body>    
+            <body>
                 <h1>MILLESIMA</h1>
                 <script id="__NEXT_DATA__" type="application/json">
                     {dumps(json_data)}
@@ -168,7 +168,7 @@ def mock_site():
 
         html_product = f"""
         <html>
-            <body>    
+            <body>
                 <h1>MILLESIMA</h1>
                 <script id="__NEXT_DATA__" type="application/json">
                     {dumps(json_data)}
@@ -179,7 +179,7 @@ def mock_site():
 
         list_pleine = f"""
             <html>
-                <body>    
+                <body>
                     <h1>LE WINE</h1>
                     <script id="__NEXT_DATA__" type="application/json">
                         {dumps({
@@ -207,7 +207,7 @@ def mock_site():
 
         list_vide = f"""
             <html>
-                <body>    
+                <body>
                     <h1>LE WINE</h1>
                     <script id="__NEXT_DATA__" type="application/json">
                         {dumps({
@@ -319,7 +319,7 @@ def test_informations(scraper: Scraper):
 def test_search(scraper: Scraper):
     m = mock_open()
     with patch("builtins.open", m):
-        scraper.getvins("wine.html", "fake_file.csv", False)
+        scraper.getvins("wine.html", "fake_file.csv", True)
 
     assert m().write.called
     all_writes = "".join(call.args[0] for call in m().write.call_args_list)