diff --git a/src/main.py b/src/main.py deleted file mode 100755 index 59d1b2c..0000000 --- a/src/main.py +++ /dev/null @@ -1,20 +0,0 @@ -#!/usr/bin/env python3 - -from os import getcwd -from os.path import normpath, join -from sys import argv -from pandas import read_csv, DataFrame - -def main() -> None: - if len(argv) != 2: - raise ValueError(f"{argv[0]} <csv_path>") - - path: str = normpath(join(getcwd(), argv[1])) - db: DataFrame = read_csv(path) - print(db.all()) - -if __name__ == "__main__": - try: - main() - except Exception as e: - print(f"ERREUR: {e}") \ No newline at end of file diff --git a/src/scraper.py b/src/scraper.py index 0dcc474..4e0737e 100755 --- a/src/scraper.py +++ b/src/scraper.py @@ -1,12 +1,49 @@ #!/usr/bin/env python3 +from io import SEEK_END, SEEK_SET, BufferedWriter, BufferedReader +from os.path import exists, normpath, realpath, dirname, join +from os import makedirs +from pickle import dump, load, UnpicklingError from sys import argv -from typing import cast +from typing import Any, Callable, Literal, cast from requests import HTTPError, Response, Session from bs4 import BeautifulSoup, Tag from collections import OrderedDict from json import JSONDecodeError, loads +_dir: str = dirname(realpath(__file__)) + + +def _getcache[T](mode: Literal["rb", "wb"], fn: Callable[[Any], T]) -> T | None: +    """Run *fn* on the pickle cache file (.cache/save) opened in *mode*. + +    Returns: +        The value returned by *fn*, or None when the cache file is missing, empty or corrupt. +    """ +    cache_dirname = normpath(join(_dir, ".cache")) +    save_path = normpath(join(cache_dirname, "save")) + +    if not exists(cache_dirname): +        makedirs(cache_dirname) + +    try: +        with open(save_path, mode) as f: +            return fn(f) +    except (FileNotFoundError, EOFError, UnpicklingError): +        return None + + +def savestate(data: tuple[int, set[str]]) -> None: +    def save(f: BufferedWriter) -> None: +        _ = f.seek(0) +        _ = f.truncate() +        dump(data, f) +        f.flush() +    _getcache("wb", save) + +def loadstate() -> tuple[int, set[str]] | None: +    return _getcache("rb", lambda f: load(f)) + class _ScraperData: """ @@ -189,7 +226,7 
@@ class Scraper: def __init__(self) -> None: """ Initialise l'infrastructure de navigation: - + - créer une session pour éviter de faire un handshake pour chaque requête - ajout d'un header pour éviter le blocage de l'accès au site - ajout d'un système de cache @@ -347,7 +384,7 @@ class Scraper: except (JSONDecodeError, HTTPError): return None - def getvins(self, subdir: str, filename: str, reset: bool) -> None: + def getvins(self, subdir: str, filename: str, reset: bool = False) -> None: """ Scrape récursivement toutes les pages d'une catégorie et sauvegarde en CSV. @@ -356,45 +393,65 @@ class Scraper: filename (str): Nom du fichier de sortie (ex: 'vins.csv'). reset (bool): (Optionnel) pour réinitialiser le processus. """ - with open(filename, "w") as f: - cache: set[str] = set[str]() - page = 0 - _ = f.write("Appellation,Robert,Robinson,Suckling,Prix\n") + mode: Literal["w", "a+"] = "w" if reset else "a+" + title: str = "Appellation,Robert,Robinson,Suckling,Prix\n" + page: int = 1 + cache: set[str] = set[str]() - while True: - page += 1 - products_list: list[str] | None = self._geturlproductslist( - f"{subdir}?page={page}" - ) + if not reset: + serializable: tuple[int, set[str]] | None = loadstate() + if isinstance(serializable, tuple): + page, cache = serializable + try: + with open(filename, mode) as f: + # check si le titre est bien présent au début du buffer + # sinon il l'ecrit, petit bug potentiel, a+ ecrit tout le + # temps a la fin du buffer, si on a ecrit des choses avant + # le titre sera apres ces données mais on part du principe + # que personne va toucher le fichier. 
+                _ = f.seek(0, SEEK_SET) +                if not (f.read(len(title)) == title): +                    _ = f.write(title) +                else: +                    _ = f.seek(0, SEEK_END) -                if not products_list: -                    break +                while True: +                    products_list: list[str] | None = self._geturlproductslist( +                        f"{subdir}?page={page}" +                    ) -                products_list_length = len(products_list) -                for i, product in enumerate(products_list): -                    if not isinstance(product, dict): -                        continue +                    if not products_list: +                        break -                    link = product.get("seoKeyword") +                    products_list_length = len(products_list) +                    for i, product in enumerate(products_list): +                        if not isinstance(product, dict): +                            continue -                    if link and link not in cache: -                        try: -                            infos = self.getjsondata(link).informations() -                            _ = f.write(infos + "\n") -                            print( -                                f"page: {page} | {i + 1}/{products_list_length} {link}" -                            ) -                            cache.add(link) -                        except (JSONDecodeError, HTTPError) as e: -                            print(f"Erreur sur le produit {link}: {e}") -                        f.flush() +                        link = product.get("seoKeyword") + +                        if link and link not in cache: +                            try: +                                infos = self.getjsondata(link).informations() +                                _ = f.write(infos + "\n") +                                print( +                                    f"page: {page} | {i + 1}/{products_list_length} {link}" +                                ) +                                cache.add(link) +                            except (JSONDecodeError, HTTPError) as e: +                                print(f"Erreur sur le produit {link}: {e}") +                            f.flush() +                    page += 1 +        finally: +            if not reset: +                savestate((page, cache)) def main() -> None: -    if len(argv) != 2: -        raise ValueError(f"{argv[0]} <subdir>") +    if len(argv) != 3: +        raise ValueError(f"{argv[0]} <filename> <subdir>") scraper: Scraper = Scraper() -    scraper.getvins(argv[1], "donnee.csv", False) +    scraper.getvins(argv[2], argv[1]) if __name__ == "__main__": diff --git a/tests/test_main.py b/tests/test_main.py deleted file mode 100644 index e69de29..0000000