add: search functions

2026-02-11 23:35:56 +01:00
parent ed86e588f7
commit 96dbaaaaf6
2 changed files with 63 additions and 31 deletions

main.py

@@ -1,8 +1,8 @@
 from typing import cast
-from requests import Response, Session
+from requests import HTTPError, Response, Session
 from bs4 import BeautifulSoup, Tag
 from collections import OrderedDict
-from json import loads
+from json import JSONDecodeError, loads


 class _ScraperData:
@@ -284,41 +284,46 @@ class Scraper:
         return _ScraperData(cast(dict[str, object], current_data))

-    # def _geturlsearch(self, subdir: str, index: int) -> str | None:
-    #     data: dict[str, object] = self.getjsondata(subdir).getdata()
-    #     for element in ["initialReduxState", "categ", "content"]:
-    #         data = cast(dict[str, object], data.get(element))
-    #         if data is None or not isinstance(data, dict):
-    #             return None
-    #     products = data.get("products")
-    #     if not isinstance(products, list) or index >= len(products):
-    #         return None
-    #     product = products[index]
-    #     if isinstance(product, dict):
-    #         return str(product.get("seoKeyword"))
-    #     return None
-
-    # def getvins(self, subdir: str) -> None:
-    #     cache: set[str] = set[str]()
-
-    #     for page in range(1, 2):
-    #         index_link = 1
-    #         while True:
-    #             link: str | None = self._geturlsearch(
-    #                 subdir=f"{subdir}?page={page}", index=index_link
-    #             )
-    #             index_link+=1
-    #             if link is None:
-    #                 break
-
-    #             if link not in cache:
-    #                 print(self.getjsondata(link).informations())
-    #                 cache.add(link)
+    def _geturlproductslist(self, subdir: str):
+        try:
+            data: dict[str, object] = self.getjsondata(subdir).getdata()
+            for element in ["initialReduxState", "categ", "content"]:
+                data = cast(dict[str, object], data.get(element))
+                if data is None or not isinstance(data, dict):
+                    return None
+            products = data.get("products")
+            if isinstance(products, list):
+                return products
+        except (JSONDecodeError, HTTPError):
+            return None
+
+    def getvins(self, subdir: str):
+        cache: set[str] = set[str]()
+
+        for page in range(1, 64):
+            products_list = self._geturlproductslist(f"{subdir}?page={page}")
+            if not products_list:
+                break
+            for product in products_list:
+                if not isinstance(product, dict):
+                    continue
+                link = product.get("seoKeyword")
+
+                if not link:
+                    continue
+
+                if link not in cache:
+                    try:
+                        infos = self.getjsondata(link).informations()
+                        print(infos)
+                        cache.add(link)
+                    except (JSONDecodeError, HTTPError) as e:
+                        print(f"Error on product {link}: {e}")

-# Scraper().getvins("bordeaux.html")
+Scraper().getvins("bordeaux.html")
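
For context, the new _geturlproductslist boils down to a guarded walk through the nested keys initialReduxState -> categ -> content -> products of the JSON state embedded in each page. Below is a minimal standalone sketch of that pattern; the payload literal is a made-up illustration, not data from the site.

from typing import Any

def walk_products(payload: dict[str, Any]) -> list[Any] | None:
    # Descend one key at a time; bail out as soon as the shape is wrong.
    data: Any = payload
    for key in ["initialReduxState", "categ", "content"]:
        if not isinstance(data, dict):
            return None
        data = data.get(key)
    if isinstance(data, dict) and isinstance(data.get("products"), list):
        return data["products"]
    return None

# Hypothetical payload, for illustration only.
payload = {
    "initialReduxState": {
        "categ": {"content": {"products": [{"seoKeyword": "some-product.html"}]}}
    }
}
print(walk_products(payload))  # [{'seoKeyword': 'some-product.html'}]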

test.json (new file)

@@ -0,0 +1,27 @@
+Haut-Médoc,None,None,90,16.1
+Haut-Médoc,90.0,16,None,23.2
+Haut-Médoc,None,None,None,14.2
+Saint-Estèphe,93,17,96,59.0
+Pessac-Léognan,90,16.5,94,36.0
+Pessac-Léognan,89,16.5,95,39.2
+Haut-Médoc,89.0,15,92,26.8
+Haut-Médoc,92,16.5,93,65.4
+Margaux,92,16,93,64.3
+Moulis,92,15.5,93,41.4
+Haut-Médoc,None,None,None,15.2
+Pauillac,97,18,98,298.0
+Saint-Emilion,95,15,96,106.4
+Haut-Médoc,92,16,95,32.7
+Pomerol,88,16,96,92.7
+Pessac-Léognan,90,16.5,93,33.1
+Haut-Médoc,89,14.5,None,18.8
+Haut-Médoc,93.0,16,94,56.3
+Pessac-Léognan,86,None,92,31.6
+Haut-Médoc,89,16,92,25.1
+Haut-Médoc,92,16,94.5,68.7
+Saint-Estèphe,91.0,None,91.5,27.3
+Pomerol,94,16.5,95,97.5
+Margaux,93,16,95,54.5
+Pessac-Léognan,93,16.5,93,46.9
+Saint-Estèphe,92,16,96,50.1
+Pessac-Léognan,93,16.5,94,60.9
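
Despite its .json extension, the new file holds comma-separated rows in which missing values appear as the literal string None. A hedged sketch of how such rows could be read back; the commit does not name the columns, so the cells are only typed, not labeled.

import csv

def parse_cell(cell: str) -> str | float | None:
    # "None" marks a missing value; numeric cells become floats;
    # the rest (appellation names) stay as strings.
    if cell == "None":
        return None
    try:
        return float(cell)
    except ValueError:
        return cell

with open("test.json", encoding="utf-8") as f:
    for row in csv.reader(f):
        print([parse_cell(cell) for cell in row])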