mirror of
https://github.com/guezoloic/millesima_projetS6.git
synced 2026-03-28 19:13:42 +00:00
ajout: système de cache pour éviter de recommencer
This commit is contained in:
20
src/main.py
20
src/main.py
@@ -1,20 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
|
|
||||||
from os import getcwd
|
|
||||||
from os.path import normpath, join
|
|
||||||
from sys import argv
|
|
||||||
from pandas import read_csv, DataFrame
|
|
||||||
|
|
||||||
def main() -> None:
    """Read the CSV file named on the command line and print ``DataFrame.all()``.

    The single command-line argument is resolved against the current
    working directory before being handed to pandas.

    Raises:
        ValueError: If the program is not called with exactly one argument;
            the message is the usage line.
    """
    if len(argv) != 2:
        usage = f"{argv[0]} <filename.csv>"
        raise ValueError(usage)

    target = join(getcwd(), argv[1])
    table: DataFrame = read_csv(normpath(target))
    # NOTE(review): DataFrame.all() reduces each column to a single boolean;
    # printing the frame itself may have been the intent -- confirm.
    summary = table.all()
    print(summary)
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Script boundary: any failure is reported as a single "ERREUR" line
    # instead of a traceback.
    try:
        main()
    except Exception as err:
        print(f"ERREUR: {err}")
|
|
||||||
123
src/scraper.py
123
src/scraper.py
@@ -1,12 +1,49 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
from io import SEEK_END, SEEK_SET, BufferedWriter, BufferedReader
|
||||||
|
from os.path import exists, normpath, realpath, dirname, join
|
||||||
|
from os import makedirs
|
||||||
|
from pickle import dump, load, UnpicklingError
|
||||||
from sys import argv
|
from sys import argv
|
||||||
from typing import cast
|
from typing import Any, Callable, Literal, cast
|
||||||
from requests import HTTPError, Response, Session
|
from requests import HTTPError, Response, Session
|
||||||
from bs4 import BeautifulSoup, Tag
|
from bs4 import BeautifulSoup, Tag
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
from json import JSONDecodeError, loads
|
from json import JSONDecodeError, loads
|
||||||
|
|
||||||
|
# Directory that contains this source file; the cache directory is created
# underneath it. __file__ is required here: realpath(__name__) would resolve
# the module *name* against the current working directory instead.
_dir: str = dirname(realpath(__file__))
|
||||||
|
|
||||||
|
|
||||||
|
def _getcache[T](mode: Literal["rb", "wb"], fn: Callable[[Any], T]) -> T | None:
|
||||||
|
"""_summary_
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
_type_: _description_
|
||||||
|
"""
|
||||||
|
cache_dirname = normpath(join(_dir, ".cache"))
|
||||||
|
save_path = normpath(join(cache_dirname, "save"))
|
||||||
|
|
||||||
|
if not exists(cache_dirname):
|
||||||
|
makedirs(cache_dirname)
|
||||||
|
|
||||||
|
try:
|
||||||
|
with open(save_path, mode) as f:
|
||||||
|
return fn(f)
|
||||||
|
except (FileNotFoundError, EOFError, UnpicklingError):
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def savestate(data: tuple[int, set[str]]) -> None:
    """Persist the scraping checkpoint to the cache file.

    Args:
        data: ``(page, links)`` pair: the page being processed and the set
            of product links already written to the output file.
    """
    # The cache file is opened in "wb", which truncates it, and it is closed
    # (hence flushed) by the `with` block in _getcache, so no explicit
    # seek/truncate/flush is needed around the dump.
    _getcache("wb", lambda f: dump(data, f))
|
||||||
|
|
||||||
|
def loadstate() -> tuple[int, set[str]] | None:
    """Read the scraping checkpoint back from the cache file.

    Returns:
        The unpickled content of the cache file, or ``None`` when
        ``_getcache`` reports the file as missing, empty or unreadable.
        The shape of the value is not validated here.
    """
    # NOTE: pickle can execute arbitrary code while loading; the cache file
    # must only ever be one written by this program.
    return _getcache("rb", load)
|
||||||
|
|
||||||
|
|
||||||
class _ScraperData:
|
class _ScraperData:
|
||||||
"""
|
"""
|
||||||
@@ -189,7 +226,7 @@ class Scraper:
|
|||||||
def __init__(self) -> None:
|
def __init__(self) -> None:
|
||||||
"""
|
"""
|
||||||
Initialise l'infrastructure de navigation:
|
Initialise l'infrastructure de navigation:
|
||||||
|
|
||||||
- créer une session pour éviter de faire un handshake pour chaque requête
|
- créer une session pour éviter de faire un handshake pour chaque requête
|
||||||
- ajout d'un header pour éviter le blocage de l'accès au site
|
- ajout d'un header pour éviter le blocage de l'accès au site
|
||||||
- ajout d'un système de cache
|
- ajout d'un système de cache
|
||||||
@@ -347,7 +384,7 @@ class Scraper:
|
|||||||
except (JSONDecodeError, HTTPError):
|
except (JSONDecodeError, HTTPError):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def getvins(self, subdir: str, filename: str, reset: bool = False) -> None:
    """
    Scrape every page of a category and save the results as CSV.

    Unless ``reset`` is set, progress (current page and links already
    written) is restored from the cache before starting and saved back to
    it when the run is interrupted or fails.

    Args:
        subdir (str): Category sub-URL; ``?page=N`` is appended to it.
        filename (str): Name of the output file (e.g. 'vins.csv').
        reset (bool): (Optional) start over: the output file is truncated
            and the saved progress is neither read nor written.

    Raises:
        Exception: Any unexpected error is re-raised once the progress has
            been saved. A keyboard interrupt only saves the progress and
            returns.
    """
    # "w+" rather than "w": the header check below reads from the handle,
    # and a write-only handle raises io.UnsupportedOperation on read().
    mode: Literal["w+", "a+"] = "w+" if reset else "a+"
    title: str = "Appellation,Robert,Robinson,Suckling,Prix\n"
    page: int = 1
    cache: set[str] = set()

    if not reset:
        serializable: tuple[int, set[str]] | None = loadstate()
        if isinstance(serializable, tuple):
            page, cache = serializable

    try:
        with open(filename, mode) as f:
            # Write the header unless the file already starts with it.
            # Known limitation: "a+" always writes at the end of the file,
            # so if the file already holds other data the header lands
            # after it; the file is assumed not to be edited by hand.
            _ = f.seek(0, SEEK_SET)
            if f.read(len(title)) != title:
                _ = f.write(title)
            else:
                _ = f.seek(0, SEEK_END)

            while True:
                # NOTE(review): annotated as list[str] although the items
                # are filtered with isinstance(product, dict) below --
                # confirm the return type of _geturlproductslist.
                products_list: list[str] | None = self._geturlproductslist(
                    f"{subdir}?page={page}"
                )

                # No (or empty) product list: last page reached.
                if not products_list:
                    break

                products_list_length = len(products_list)
                for i, product in enumerate(products_list):
                    if not isinstance(product, dict):
                        continue

                    link = product.get("seoKeyword")

                    # Skip products already written during this or a
                    # previous (resumed) run.
                    if link and link not in cache:
                        try:
                            infos = self.getjsondata(link).informations()
                            _ = f.write(infos + "\n")
                            print(
                                f"page: {page} | {i + 1}/{products_list_length} {link}"
                            )
                            cache.add(link)
                        except (JSONDecodeError, HTTPError) as e:
                            print(f"Erreur sur le produit {link}: {e}")
                f.flush()
                page += 1
    except BaseException as exc:
        # Checkpoint on any interruption so the next run can resume.
        if not reset:
            savestate((page, cache))
        # Ctrl-C is the expected way to stop a run; anything else is a real
        # failure and must stay visible to the caller.
        if not isinstance(exc, KeyboardInterrupt):
            raise
|
||||||
|
|
||||||
|
|
||||||
def main() -> None:
    """Command-line entry point: ``<program> <filename> <sous-url>``.

    Builds a ``Scraper`` and runs ``getvins`` with the sub-URL (second
    argument) and the output file name (first argument).

    Raises:
        ValueError: If the program is not called with exactly two
            arguments; the message is the usage line.
    """
    if len(argv) != 3:
        usage = f"{argv[0]} <filename> <sous-url>"
        raise ValueError(usage)

    output_file, category = argv[1], argv[2]
    Scraper().getvins(category, output_file)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
Reference in New Issue
Block a user