ajout: système de cache pour éviter de recommencer

2026-03-28 19:13:42 +00:00 · 2026-03-02 18:30:26 +01:00
parent 123c43aa05
commit 3619890dc4
3 changed files with 90 additions and 53 deletions
--- a/src/main.py
+++ b/src/main.py
@@ -1,20 +0,0 @@
 #!/usr/bin/env python3
 from os import getcwd
 from os.path import normpath, join
 from sys import argv
 from pandas import read_csv, DataFrame
 def main() -> None:
    """Load the CSV file named on the command line and print ``DataFrame.all()``.

    The single command-line argument is joined onto the current working
    directory and normalised before being handed to ``read_csv``.

    Raises:
        ValueError: If the command line does not consist of the program name
            plus exactly one argument; the message is the usage string.
    """
    expected_argc = 2
    if len(argv) != expected_argc:
        usage = f"{argv[0]} <filename.csv>"
        raise ValueError(usage)
    # Resolve the argument against the directory the program was started from.
    csv_path: str = normpath(join(getcwd(), argv[1]))
    frame: DataFrame = read_csv(csv_path)
    column_flags = frame.all()
    print(column_flags)
 if __name__ == "__main__":
    # Report any failure as a single line instead of a traceback.
    try:
        main()
    except Exception as err:
        print(f"ERREUR: {err}")
--- a/src/scraper.py
+++ b/src/scraper.py
@@ -1,12 +1,49 @@
 #!/usr/bin/env python3
 from io import SEEK_END, SEEK_SET, BufferedWriter, BufferedReader
 from os.path import exists, normpath, realpath, dirname, join
 from os import makedirs
 from pickle import dump, load, UnpicklingError
 from sys import argv
-from typing import cast
+from typing import Any, Callable, Literal, cast
 from requests import HTTPError, Response, Session
 from bs4 import BeautifulSoup, Tag
 from collections import OrderedDict
 from json import JSONDecodeError, loads
 _dir: str = dirname(realpath(__name__))
 def _getcache[T](mode: Literal["rb", "wb"], fn: Callable[[Any], T]) -> T | None:
    """_summary_
    Returns:
        _type_: _description_
    """
    cache_dirname = normpath(join(_dir, ".cache"))
    save_path = normpath(join(cache_dirname, "save"))
    if not exists(cache_dirname):
        makedirs(cache_dirname)
    try:
        with open(save_path, mode) as f:
            return fn(f)
    except (FileNotFoundError, EOFError, UnpicklingError):
        return None
 def savestate(data: tuple[int, set[str]]) -> None:
    """Pickle ``data`` into the cache file, replacing its previous content.

    Args:
        data: The ``(int, set[str])`` tuple to persist.
    """
    def _write(stream: BufferedWriter) -> None:
        # Rewind and empty the file before dumping so only one pickle remains.
        stream.seek(0)
        stream.truncate()
        dump(data, stream)
        stream.flush()

    _getcache("wb", _write)
 def loadstate() -> tuple[int, set[str]] | None:
    """Read back the object pickled in the cache file.

    Returns:
        The unpickled value, or ``None`` when ``_getcache`` returns ``None``.

    Note:
        The file content is unpickled as-is, so the cache file must only
        ever be produced by this program.
    """
    restored = _getcache("rb", load)
    return restored
 class _ScraperData:
    """
@@ -189,7 +226,7 @@ class Scraper:
    def __init__(self) -> None:
        """
        Initialise l'infrastructure de navigation:
-        
+
         - créer une session pour éviter de faire un handshake pour chaque requête
         - ajout d'un header pour éviter le blocage de l'accès au site
         - ajout d'un système de cache
@@ -347,7 +384,7 @@ class Scraper:
        except (JSONDecodeError, HTTPError):
            return None
-    def getvins(self, subdir: str, filename: str, reset: bool) -> None:
+    def getvins(self, subdir: str, filename: str, reset: bool = False) -> None:
        """
        Scrape récursivement toutes les pages d'une catégorie et sauvegarde en CSV.
@@ -356,45 +393,65 @@ class Scraper:
            filename (str): Nom du fichier de sortie (ex: 'vins.csv').
            reset (bool): (Optionnel) pour réinitialiser le processus.
        """
-        with open(filename, "w") as f:
+        mode: Literal["w", "a+"] = "w" if reset else "a+"
-            cache: set[str] = set[str]()
+        title: str = "Appellation,Robert,Robinson,Suckling,Prix\n"
-            page = 0
+        page: int = 1
-            _ = f.write("Appellation,Robert,Robinson,Suckling,Prix\n")
+        cache: set[str] = set[str]()
-            while True:
+        if not reset:
-                page += 1
+            serializable: tuple[int, set[str]] | None = loadstate()
-                products_list: list[str] | None = self._geturlproductslist(
+            if isinstance(serializable, tuple):
-                    f"{subdir}?page={page}"
+                page, cache = serializable
-                )
+        try:
            with open(filename, mode) as f:
                # check si le titre est bien présent au début du buffer
                # sinon il l'ecrit, petit bug potentiel, a+ ecrit tout le
                # temps a la fin du buffer, si on a ecrit des choses avant
                # le titre sera apres ces données mais on part du principe
                # que personne va toucher le fichier.
                _ = f.seek(0, SEEK_SET)
                if not (f.read(len(title)) == title):
                    _ = f.write(title)
                else:
                    _ = f.seek(0, SEEK_END)
-                if not products_list:
+                while True:
-                    break
+                    products_list: list[str] | None = self._geturlproductslist(
                        f"{subdir}?page={page}"
                    )
-                products_list_length = len(products_list)
+                    if not products_list:
-                for i, product in enumerate(products_list):
+                        break
                    if not isinstance(product, dict):
                        continue
-                    link = product.get("seoKeyword")
+                    products_list_length = len(products_list)
                    for i, product in enumerate(products_list):
                        if not isinstance(product, dict):
                            continue
-                    if link and link not in cache:
+                        link = product.get("seoKeyword")
-                        try:
+
-                            infos = self.getjsondata(link).informations()
+                        if link and link not in cache:
-                            _ = f.write(infos + "\n")
+                            try:
-                            print(
+                                infos = self.getjsondata(link).informations()
-                                f"page: {page} | {i + 1}/{products_list_length} {link}"
+                                _ = f.write(infos + "\n")
-                            )
+                                print(
-                            cache.add(link)
+                                    f"page: {page} | {i + 1}/{products_list_length} {link}"
-                        except (JSONDecodeError, HTTPError) as e:
+                                )
-                            print(f"Erreur sur le produit {link}: {e}")
+                                cache.add(link)
-                f.flush()
+                            except (JSONDecodeError, HTTPError) as e:
                                print(f"Erreur sur le produit {link}: {e}")
                    f.flush()
                    page += 1
        except:
            if not reset:
                savestate((page, cache))
 def main() -> None:
-    if len(argv) != 2:
+    if len(argv) != 3:
-        raise ValueError(f"{argv[0]} <sous-url>")
+        raise ValueError(f"{argv[0]} <filename> <sous-url>")
    scraper: Scraper = Scraper()
-    scraper.getvins(argv[1], "donnee.csv", False)
+    scraper.getvins(argv[2], argv[1])
 if __name__ == "__main__":
--- a/tests/test_main.py
+++ b/tests/test_main.py