ajout: système de cache pour éviter de recommencer

This commit is contained in:
2026-03-02 18:30:26 +01:00
parent 123c43aa05
commit 3619890dc4
3 changed files with 90 additions and 53 deletions

View File

@@ -1,20 +0,0 @@
#!/usr/bin/env python3
from os import getcwd
from os.path import normpath, join
from sys import argv
from pandas import read_csv, DataFrame
def main() -> None:
    """Read the CSV file given on the command line and print its contents.

    Raises:
        ValueError: if exactly one CSV-path argument was not supplied.
    """
    if len(argv) != 2:
        raise ValueError(f"{argv[0]} <filename.csv>")
    # Resolve the argument relative to the current working directory.
    path: str = normpath(join(getcwd(), argv[1]))
    db: DataFrame = read_csv(path)
    # BUG FIX: DataFrame.all() reduces each column to a single boolean,
    # which is not a display of the data; print the full table instead.
    print(db.to_string())
# Script entry point: run main() and report failures as a short message
# instead of dumping a traceback to the user.
if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        # Top-level boundary: any error is surfaced as text ("ERREUR" is
        # French for "error"; the message string is user-facing, kept as-is).
        print(f"ERREUR: {e}")

View File

@@ -1,12 +1,49 @@
#!/usr/bin/env python3
from io import SEEK_END, SEEK_SET, BufferedWriter, BufferedReader
from os.path import exists, normpath, realpath, dirname, join
from os import makedirs
from pickle import dump, load, UnpicklingError
from sys import argv
from typing import cast
from typing import Any, Callable, Literal, cast
from requests import HTTPError, Response, Session
from bs4 import BeautifulSoup, Tag
from collections import OrderedDict
from json import JSONDecodeError, loads
_dir: str = dirname(realpath(__name__))
def _getcache[T](mode: Literal["rb", "wb"], fn: Callable[[Any], T]) -> T | None:
"""_summary_
Returns:
_type_: _description_
"""
cache_dirname = normpath(join(_dir, ".cache"))
save_path = normpath(join(cache_dirname, "save"))
if not exists(cache_dirname):
makedirs(cache_dirname)
try:
with open(save_path, mode) as f:
return fn(f)
except (FileNotFoundError, EOFError, UnpicklingError):
return None
def savestate(data: tuple[int, set[str]]) -> None:
    """Persist the scraping state (page number, seen-URL cache) to disk."""
    def _write(handle: BufferedWriter) -> None:
        # Rewind and truncate so the snapshot fully replaces any prior one,
        # then force the bytes out to disk immediately.
        _ = handle.seek(0)
        _ = handle.truncate()
        dump(data, handle)
        handle.flush()
    _getcache("wb", _write)
def loadstate() -> tuple[int, set[str]] | None:
    """Read back the persisted scraping state, or None if no valid cache exists."""
    # pickle.load already takes the file object directly — no wrapper needed.
    return _getcache("rb", load)
class _ScraperData:
"""
@@ -347,7 +384,7 @@ class Scraper:
except (JSONDecodeError, HTTPError):
return None
def getvins(self, subdir: str, filename: str, reset: bool) -> None:
def getvins(self, subdir: str, filename: str, reset: bool = False) -> None:
"""
Scrape récursivement toutes les pages d'une catégorie et sauvegarde en CSV.
@@ -356,13 +393,29 @@ class Scraper:
filename (str): Nom du fichier de sortie (ex: 'vins.csv').
reset (bool): (Optionnel) pour réinitialiser le processus.
"""
with open(filename, "w") as f:
mode: Literal["w", "a+"] = "w" if reset else "a+"
title: str = "Appellation,Robert,Robinson,Suckling,Prix\n"
page: int = 1
cache: set[str] = set[str]()
page = 0
_ = f.write("Appellation,Robert,Robinson,Suckling,Prix\n")
if not reset:
serializable: tuple[int, set[str]] | None = loadstate()
if isinstance(serializable, tuple):
page, cache = serializable
try:
with open(filename, mode) as f:
# check si le titre est bien présent au début du buffer
# sinon il l'ecrit, petit bug potentiel, a+ ecrit tout le
# temps a la fin du buffer, si on a ecrit des choses avant
# le titre sera apres ces données mais on part du principe
# que personne va toucher le fichier.
_ = f.seek(0, SEEK_SET)
if not (f.read(len(title)) == title):
_ = f.write(title)
else:
_ = f.seek(0, SEEK_END)
while True:
page += 1
products_list: list[str] | None = self._geturlproductslist(
f"{subdir}?page={page}"
)
@@ -388,13 +441,17 @@ class Scraper:
except (JSONDecodeError, HTTPError) as e:
print(f"Erreur sur le produit {link}: {e}")
f.flush()
page += 1
except:
if not reset:
savestate((page, cache))
def main() -> None:
if len(argv) != 2:
raise ValueError(f"{argv[0]} <sous-url>")
if len(argv) != 3:
raise ValueError(f"{argv[0]} <filename> <sous-url>")
scraper: Scraper = Scraper()
scraper.getvins(argv[1], "donnee.csv", False)
scraper.getvins(argv[2], argv[1])
if __name__ == "__main__":

View File