diff --git a/main.py b/main.py index 62d3d22..a08da22 100644 --- a/main.py +++ b/main.py @@ -1,8 +1,8 @@ from typing import cast -from requests import Response, Session +from requests import HTTPError, Response, Session from bs4 import BeautifulSoup, Tag from collections import OrderedDict -from json import loads +from json import JSONDecodeError, loads class _ScraperData: @@ -284,41 +284,46 @@ class Scraper: return _ScraperData(cast(dict[str, object], current_data)) - # def _geturlsearch(self, subdir: str, index: int) -> str | None: - # data: dict[str, object] = self.getjsondata(subdir).getdata() + def _geturlproductslist(self, subdir: str): + try: + data: dict[str, object] = self.getjsondata(subdir).getdata() - # for element in ["initialReduxState", "categ", "content"]: - # data = cast(dict[str, object], data.get(element)) - # if data is None or not isinstance(data, dict): - # return None + for element in ["initialReduxState", "categ", "content"]: + data = cast(dict[str, object], data.get(element)) + if data is None or not isinstance(data, dict): + return None - # products = data.get("products") - # if not isinstance(products, list) or index >= len(products): - # return None + products = data.get("products") + if isinstance(products, list): + return products + except JSONDecodeError | HTTPError: + return None - # product = products[index] - # if isinstance(product, dict): - # return str(product.get("seoKeyword")) + def getvins(self, subdir: str): + cache: set[str] = set[str]() - # return None + for page in range(1, 64): + products_list = self._geturlproductslist(f"{subdir}?page={page}") - # def getvins(self, subdir: str) -> None: - # cache: set[str] = set[str]() + if not products_list: + break + + for product in products_list: + if not isinstance(product, dict): + continue - # for page in range(1, 2): - # index_link = 1 - # while True: - # link: str | None = self._geturlsearch( - # subdir=f"{subdir}?page={page}", index=index_link - # ) + link = product.get("seoKeyword") - # index_link+=1 - # if link is None: - # break - - # if link not in cache: - # print(self.getjsondata(link).informations()) - # cache.add(link) + if not link: + continue + + if link not in cache: + try: + infos = self.getjsondata(link).informations() + print(infos) + cache.add(link) + except JSONDecodeError | HTTPError as e: + print(f"Erreur sur le produit {link}: {e}") -# Scraper().getvins("bordeaux.html") +print(Scraper().getvins("bordeaux.html")) diff --git a/test.json b/test.json new file mode 100644 index 0000000..df6f179 --- /dev/null +++ b/test.json @@ -0,0 +1,27 @@ +Haut-Médoc,None,None,90,16.1 +Haut-Médoc,90.0,16,None,23.2 +Haut-Médoc,None,None,None,14.2 +Saint-Estèphe,93,17,96,59.0 +Pessac-Léognan,90,16.5,94,36.0 +Pessac-Léognan,89,16.5,95,39.2 +Haut-Médoc,89.0,15,92,26.8 +Haut-Médoc,92,16.5,93,65.4 +Margaux,92,16,93,64.3 +Moulis,92,15.5,93,41.4 +Haut-Médoc,None,None,None,15.2 +Pauillac,97,18,98,298.0 +Saint-Emilion,95,15,96,106.4 +Haut-Médoc,92,16,95,32.7 +Pomerol,88,16,96,92.7 +Pessac-Léognan,90,16.5,93,33.1 +Haut-Médoc,89,14.5,None,18.8 +Haut-Médoc,93.0,16,94,56.3 +Pessac-Léognan,86,None,92,31.6 +Haut-Médoc,89,16,92,25.1 +Haut-Médoc,92,16,94.5,68.7 +Saint-Estèphe,91.0,None,91.5,27.3 +Pomerol,94,16.5,95,97.5 +Margaux,93,16,95,54.5 +Pessac-Léognan,93,16.5,93,46.9 +Saint-Estèphe,92,16,96,50.1 +Pessac-Léognan,93,16.5,94,60.9