ajout: fonctions de recherche

This commit is contained in:
2026-02-11 23:35:56 +01:00
parent ed86e588f7
commit 96dbaaaaf6
2 changed files with 63 additions and 31 deletions

65
main.py
View File

@@ -1,8 +1,8 @@
from typing import cast
from requests import Response, Session
from requests import HTTPError, Response, Session
from bs4 import BeautifulSoup, Tag
from collections import OrderedDict
from json import loads
from json import JSONDecodeError, loads
class _ScraperData:
@@ -284,41 +284,46 @@ class Scraper:
return _ScraperData(cast(dict[str, object], current_data))
# def _geturlsearch(self, subdir: str, index: int) -> str | None:
# data: dict[str, object] = self.getjsondata(subdir).getdata()
def _geturlproductslist(self, subdir: str) -> list[object] | None:
    """Return the raw product list embedded in *subdir*'s page, or None.

    Fetches the page's JSON payload via ``self.getjsondata`` and walks
    the ``initialReduxState -> categ -> content`` path down to its
    ``"products"`` entry.

    Returns None when the path is missing, the entry is not a list, or
    the fetch/parse fails with an HTTP or JSON-decoding error.
    """
    try:
        data: dict[str, object] = self.getjsondata(subdir).getdata()
        for key in ("initialReduxState", "categ", "content"):
            data = cast(dict[str, object], data.get(key))
            if data is None or not isinstance(data, dict):
                return None
        products = data.get("products")
        if isinstance(products, list):
            return products
        return None
    # Bug fix: `except JSONDecodeError | HTTPError:` is invalid — a
    # union type is not an exception class, so the handler itself
    # raised TypeError instead of catching; `except` needs a tuple.
    except (JSONDecodeError, HTTPError):
        return None
# product = products[index]
# if isinstance(product, dict):
# return str(product.get("seoKeyword"))
def getvins(self, subdir: str) -> None:
    """Print the details of every product listed under *subdir*.

    Walks the paginated listing (pages 1..63), extracts each product's
    ``"seoKeyword"`` link, and prints the parsed information for every
    link not already seen.  Stops at the first empty or missing page.

    Failures on individual products are reported and skipped so one
    bad page does not abort the whole crawl.
    """
    cache: set[str] = set()
    for page in range(1, 64):
        products_list = self._geturlproductslist(f"{subdir}?page={page}")
        if not products_list:
            # Empty/None page: assume the listing is exhausted.
            break
        for product in products_list:
            if not isinstance(product, dict):
                continue
            link = product.get("seoKeyword")
            # NOTE(review): assumes seoKeyword is a non-empty str when
            # present — confirm against the site's payload schema.
            if not link or link in cache:
                continue
            try:
                infos = self.getjsondata(link).informations()
                print(infos)
                cache.add(link)
            # Bug fix: `except JSONDecodeError | HTTPError as e:` is
            # invalid — `except` requires a tuple of exception classes,
            # not a union type (which raises TypeError when triggered).
            except (JSONDecodeError, HTTPError) as e:
                print(f"Erreur sur le produit {link}: {e}")
# Scraper().getvins("bordeaux.html")
if __name__ == "__main__":
    # getvins prints each product as it goes and returns None, so
    # wrapping the call in print() only emitted a stray "None" line.
    # The __main__ guard keeps the crawl from running on import.
    Scraper().getvins("bordeaux.html")

27
test.json Normal file
View File

@@ -0,0 +1,27 @@
Haut-Médoc,None,None,90,16.1
Haut-Médoc,90.0,16,None,23.2
Haut-Médoc,None,None,None,14.2
Saint-Estèphe,93,17,96,59.0
Pessac-Léognan,90,16.5,94,36.0
Pessac-Léognan,89,16.5,95,39.2
Haut-Médoc,89.0,15,92,26.8
Haut-Médoc,92,16.5,93,65.4
Margaux,92,16,93,64.3
Moulis,92,15.5,93,41.4
Haut-Médoc,None,None,None,15.2
Pauillac,97,18,98,298.0
Saint-Emilion,95,15,96,106.4
Haut-Médoc,92,16,95,32.7
Pomerol,88,16,96,92.7
Pessac-Léognan,90,16.5,93,33.1
Haut-Médoc,89,14.5,None,18.8
Haut-Médoc,93.0,16,94,56.3
Pessac-Léognan,86,None,92,31.6
Haut-Médoc,89,16,92,25.1
Haut-Médoc,92,16,94.5,68.7
Saint-Estèphe,91.0,None,91.5,27.3
Pomerol,94,16.5,95,97.5
Margaux,93,16,95,54.5
Pessac-Léognan,93,16.5,93,46.9
Saint-Estèphe,92,16,96,50.1
Pessac-Léognan,93,16.5,94,60.9