ajout: système de cache pour éviter de recommencer

This commit is contained in:
2026-03-02 18:30:26 +01:00
parent 123c43aa05
commit 3619890dc4
3 changed files with 90 additions and 53 deletions

View File

@@ -1,20 +0,0 @@
#!/usr/bin/env python3
from os import getcwd
from os.path import normpath, join
from sys import argv
from pandas import read_csv, DataFrame
def main() -> None:
    """Read the CSV file given on the command line and print its contents.

    Raises:
        ValueError: if exactly one CSV-path argument was not supplied.
    """
    if len(argv) != 2:
        raise ValueError(f"{argv[0]} <filename.csv>")
    # Resolve the argument relative to the current working directory.
    path: str = normpath(join(getcwd(), argv[1]))
    db: DataFrame = read_csv(path)
    # BUG FIX: DataFrame.all() reduces each column to a single boolean,
    # which is not a display of the data; print the full table instead.
    print(db.to_string())
# Script entry point: run main() and report failures as a short message
# instead of dumping a traceback to the user.
if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        # Top-level boundary: any error is surfaced as text ("ERREUR" is
        # French for "error"; the message string is user-facing, kept as-is).
        print(f"ERREUR: {e}")

View File

@@ -1,12 +1,49 @@
#!/usr/bin/env python3
from io import SEEK_END, SEEK_SET, BufferedWriter, BufferedReader
from os.path import exists, normpath, realpath, dirname, join
from os import makedirs
from pickle import dump, load, UnpicklingError
from sys import argv
from typing import cast
from typing import Any, Callable, Literal, cast
from requests import HTTPError, Response, Session
from bs4 import BeautifulSoup, Tag
from collections import OrderedDict
from json import JSONDecodeError, loads
_dir: str = dirname(realpath(__name__))
def _getcache[T](mode: Literal["rb", "wb"], fn: Callable[[Any], T]) -> T | None:
"""_summary_
Returns:
_type_: _description_
"""
cache_dirname = normpath(join(_dir, ".cache"))
save_path = normpath(join(cache_dirname, "save"))
if not exists(cache_dirname):
makedirs(cache_dirname)
try:
with open(save_path, mode) as f:
return fn(f)
except (FileNotFoundError, EOFError, UnpicklingError):
return None
def savestate(data: tuple[int, set[str]]) -> None:
    """Persist the scraping state (page number, seen-URL cache) to disk."""
    def _write(handle: BufferedWriter) -> None:
        # Rewind and truncate so the snapshot fully replaces any prior one,
        # then force the bytes out to disk immediately.
        _ = handle.seek(0)
        _ = handle.truncate()
        dump(data, handle)
        handle.flush()
    _getcache("wb", _write)
def loadstate() -> tuple[int, set[str]] | None:
    """Read back the persisted scraping state, or None if no valid cache exists."""
    # pickle.load already takes the file object directly — no wrapper needed.
    return _getcache("rb", load)
class _ScraperData:
"""
@@ -347,7 +384,7 @@ class Scraper:
except (JSONDecodeError, HTTPError):
return None
def getvins(self, subdir: str, filename: str, reset: bool) -> None:
def getvins(self, subdir: str, filename: str, reset: bool = False) -> None:
"""
Scrape récursivement toutes les pages d'une catégorie et sauvegarde en CSV.
@@ -356,13 +393,29 @@ class Scraper:
filename (str): Nom du fichier de sortie (ex: 'vins.csv').
reset (bool): (Optionnel) pour réinitialiser le processus.
"""
with open(filename, "w") as f:
mode: Literal["w", "a+"] = "w" if reset else "a+"
title: str = "Appellation,Robert,Robinson,Suckling,Prix\n"
page: int = 1
cache: set[str] = set[str]()
page = 0
_ = f.write("Appellation,Robert,Robinson,Suckling,Prix\n")
if not reset:
serializable: tuple[int, set[str]] | None = loadstate()
if isinstance(serializable, tuple):
page, cache = serializable
try:
with open(filename, mode) as f:
# check si le titre est bien présent au début du buffer
# sinon il l'ecrit, petit bug potentiel, a+ ecrit tout le
# temps a la fin du buffer, si on a ecrit des choses avant
# le titre sera apres ces données mais on part du principe
# que personne va toucher le fichier.
_ = f.seek(0, SEEK_SET)
if not (f.read(len(title)) == title):
_ = f.write(title)
else:
_ = f.seek(0, SEEK_END)
while True:
page += 1
products_list: list[str] | None = self._geturlproductslist(
f"{subdir}?page={page}"
)
@@ -388,13 +441,17 @@ class Scraper:
except (JSONDecodeError, HTTPError) as e:
print(f"Erreur sur le produit {link}: {e}")
f.flush()
page += 1
except:
if not reset:
savestate((page, cache))
def main() -> None:
if len(argv) != 2:
raise ValueError(f"{argv[0]} <sous-url>")
if len(argv) != 3:
raise ValueError(f"{argv[0]} <filename> <sous-url>")
scraper: Scraper = Scraper()
scraper.getvins(argv[1], "donnee.csv", False)
scraper.getvins(argv[2], argv[1])
if __name__ == "__main__":

View File