ajout : nouvelle classe pour données et tests

This commit is contained in:
2026-02-07 20:36:09 +01:00
parent 76017e3ea3
commit 74482af7f0
2 changed files with 79 additions and 25 deletions

58
main.py
View File

@@ -1,5 +1,5 @@
from sys import stderr
from typing import cast
from typing import Any, cast
from requests import Response, Session
from bs4 import BeautifulSoup, Tag
from json import JSONDecodeError, loads
@@ -22,6 +22,7 @@ class Scraper:
self._session: Session = Session()
# Système de cache pour éviter de solliciter le serveur inutilement
self._latest_request: tuple[(str, Response | None)] = ("", None)
self._latest_soup: tuple[(str, BeautifulSoup | None)] = ("", None)
def _request(self, subdir: str) -> Response:
"""
@@ -56,13 +57,13 @@ class Scraper:
"""
rq_subdir, rq_response = self._latest_request
if rq_response is None or subdir != rq_subdir:
if rq_response is not None and subdir == rq_subdir:
return rq_response
request: Response = self._request(subdir)
self._latest_request = (subdir, request)
return request
return rq_response
def getsoup(self, subdir: str = "") -> BeautifulSoup:
"""
Récupère le contenu HTML d'une page et le transforme en objet BeautifulSoup.
@@ -76,12 +77,19 @@ class Scraper:
Raise:
HTTPError: Si le serveur renvoie un code d'erreur (4xx, 5xx).
"""
markup: str = self.getresponse(subdir).text
return BeautifulSoup(markup, features="html.parser")
rq_subdir, rq_soup = self._latest_soup
def getjsondata(
self, subdir: str = "", id: str = "__NEXT_DATA__"
) -> dict[str, object]:
if rq_soup is not None and subdir == rq_subdir:
return rq_soup
soup: BeautifulSoup = BeautifulSoup(
markup=self.getresponse(subdir).text, features="html.parser"
)
self._latest_soup = (subdir, soup)
return soup
def getjsondata(self, subdir: str = "", id: str = "__NEXT_DATA__") -> ScraperData:
"""
Extrait les données JSON contenues dans la balise __NEXT_DATA__ du site.
Beaucoup de sites modernes (Next.js) stockent leur état initial dans
@@ -120,13 +128,39 @@ class Scraper:
# si current_data est bien un dictionnaire et que la clé
# est bien dedans
if isinstance(current_data, dict) and key in current_data:
current_data = current_data[key]
current_data: object = current_data[key]
else:
raise ValueError(f"Clé manquante dans le JSON : {key}")
if isinstance(current_data, dict):
return cast(dict[str, object], current_data)
return ScraperData(data=cast(dict[str, object], current_data))
except (JSONDecodeError, ValueError) as e:
print(f"Erreur lors de l'extraction JSON : {e}", file=stderr)
return {}
return ScraperData({})
class ScraperData:
def __init__(self, data: dict[str, object]) -> None:
if not data:
raise ValueError("Données insuffisantes pour créer un ScraperData.")
self._data: dict[str, object] = data
def _getattributes(self) -> dict[str, object] | None:
current_data: object = self._data.get("attributes")
if isinstance(current_data, dict):
return cast(dict[str, object], current_data)
return None
def appellation(self) -> str | None:
current_value: dict[str, object] | None = self._getattributes()
if current_value is not None:
app_dict: dict[str, object] = cast(
dict[str, object], current_value.get("appellation")
)
if app_dict:
return cast(str, app_dict.get("value"))
return None
def getdata(self) -> dict[str, object]:
return self._data

View File

@@ -2,7 +2,7 @@ from json import dumps
from bs4 import Tag
import pytest
from requests_mock import Mocker
from main import Scraper
from main import Scraper, ScraperData
@pytest.fixture(autouse=True)
@@ -67,6 +67,16 @@ def mock_site():
"isPrevSale": False,
}
],
"attributes": {
"appellation": {
"valueId": "433",
"name": "Appellation",
"value": "Sforzato di Valtellina",
"url": "sforzato-di-valtellina.html",
"isSpirit": False,
"groupIdentifier": "appellation_433",
},
},
}
}
}
@@ -82,7 +92,10 @@ def mock_site():
</body>
</html>
"""
m.get("https://www.millesima.fr/nino-negri-5-stelle-sfursat-2022.html", text=html_product)
m.get(
"https://www.millesima.fr/nino-negri-5-stelle-sfursat-2022.html",
text=html_product,
)
# on return m sans fermer le server qui simule la page
yield m
@@ -100,9 +113,16 @@ def test_soup(scraper: Scraper):
assert h1.text == "MILLESIMA"
def test_getProductName(scraper: Scraper):
jsondata = scraper.getjsondata("nino-negri-5-stelle-sfursat-2022.html")
assert jsondata["productName"] == "Nino Negri : 5 Stelle Sfursat 2022"
assert isinstance(jsondata["items"], list)
assert len(jsondata["items"]) > 0
assert jsondata["items"][0]["offerPrice"] == 390
# def test_getProductName(scraper: Scraper):
# jsondata = scraper.getjsondata("nino-negri-5-stelle-sfursat-2022.html")
# assert jsondata["productName"] == "Nino Negri : 5 Stelle Sfursat 2022"
# assert isinstance(jsondata["items"], list)
# assert len(jsondata["items"]) > 0
# assert jsondata["items"][0]["offerPrice"] == 390
def test_appellation(scraper: Scraper):
    """The appellation accessor must surface the value from the mocked payload."""
    product_data: ScraperData = scraper.getjsondata(
        "nino-negri-5-stelle-sfursat-2022.html"
    )
    assert product_data.appellation() == "Sforzato di Valtellina"