10 Commits

Author SHA1 Message Date
0f6eb856c6 add: restructure files and update the scraper 2026-03-01 19:39:57 +01:00
d62145e250 add: add a main function 2026-02-16 13:56:55 +01:00
829c303e78 add: start of question 8 2026-02-16 13:36:17 +01:00
b584f9a301 replace: rename the *main files to scraper 2026-02-16 13:19:13 +01:00
547c7ec4c1 add: 2nd milestone 2026-02-16 13:02:45 +01:00
0aa765d6a0 fix: add a header comment and fix a bug in the scores 2026-02-16 11:11:02 +01:00
8a357abe86 add(requirements.txt): add libraries 2026-02-13 18:14:39 +01:00
DAHMANI chahrazad 2f5af5aabf Merge pull request #9 from guezoloic/exo7-loic ("Exo7 loic") 2026-02-13 17:58:41 +01:00
Loïc GUEZO 0182bbbf20 Merge pull request #7 from guezoloic/exo7+6 ("Exo7 without exo6") 2026-02-10 20:12:54 +01:00
DAHMANI chahrazad 8cae082344 Merge pull request #5 from guezoloic/exo3 (the tests are missing!!) 2026-02-09 18:57:39 +01:00

7 changed files with 74 additions and 14 deletions

pyproject.toml (new file, 15 lines)

@@ -0,0 +1,15 @@
+[project]
+name = "projet-millesima-s6"
+version = "0.1.0"
+dependencies = [
+    "requests==2.32.5",
+    "beautifulsoup4==4.14.3",
+    "pandas==2.3.3",
+]
+
+[project.optional-dependencies]
+test = ["pytest==8.4.2", "requests-mock==1.12.1"]
+
+[build-system]
+requires = ["setuptools"]
+build-backend = "setuptools.build_meta"
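
The pinned dependencies move from requirements.txt into this pyproject.toml. A quick sketch (assuming Python 3.11+, where tomllib is in the standard library) to check that the file parses and the pins are what we expect:

import tomllib  # stdlib TOML parser since Python 3.11

with open("pyproject.toml", "rb") as f:
    project = tomllib.load(f)["project"]

print(project["name"])          # projet-millesima-s6
print(project["dependencies"])  # the three pinned runtime libraries
print(project["optional-dependencies"]["test"])  # pytest + requests-mock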

requirements.txt (deleted, 3 lines)

@@ -1,3 +0,0 @@
-requests>=2.32.5
-requests-mock>=1.12.1
-beautifulsoup4>=4.14.3

Binary file not shown.

src/main.py (new executable file, 20 lines)

@@ -0,0 +1,20 @@
+#!/usr/bin/env python3
+from os import getcwd
+from os.path import normpath, join
+from sys import argv
+from pandas import read_csv, DataFrame
+
+
+def main() -> None:
+    if len(argv) != 2:
+        raise ValueError(f"{argv[0]} <filename.csv>")
+    path: str = normpath(join(getcwd(), argv[1]))
+    db: DataFrame = read_csv(path)
+    print(db)  # show the loaded DataFrame
+
+
+if __name__ == "__main__":
+    try:
+        main()
+    except Exception as e:
+        print(f"ERROR: {e}")

main.py → src/scraper.py (renamed, made executable; 46 lines changed)

@@ -1,3 +1,6 @@
+#!/usr/bin/env python3
+from sys import argv
 from typing import cast
 from requests import HTTPError, Response, Session
 from bs4 import BeautifulSoup, Tag
@@ -6,8 +9,8 @@ from json import JSONDecodeError, loads
 class _ScraperData:
-    """_summary_
-    """
+    """_summary_"""
 
     def __init__(self, data: dict[str, object]) -> None:
         """_summary_
@@ -121,7 +124,7 @@ class _ScraperData:
         val = cast(str, app_dict.get("value")).rstrip("+").split("-")
         if len(val) > 1 and val[1] != "":
-            val[0] = str((int(val[0]) + int(val[1])) / 2)
+            val[0] = str(round((float(val[0]) + float(val[1])) / 2, 1))
             return val[0]
         return None
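
The int() → float() change matters because score ranges can have decimal endpoints, and round(..., 1) keeps the output tidy. A quick worked check:

try:
    int("94.5")  # the old cast fails on a decimal endpoint
except ValueError as e:
    print(e)  # invalid literal for int() with base 10: '94.5'

# the new computation, for a "93-94.5" range:
print(str(round((float("93") + float("94.5")) / 2, 1)))  # 93.8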
@@ -171,6 +174,18 @@ class Scraper:
         # Reusing one Session avoids redoing the same TCP handshake for
         # every request and keeps a persistent connection to the server
         self._session: Session = Session()
+        # Present a browser-like "identity card" so the site does not
+        # block us for being a bot
+        self._session.headers.update(
+            {
+                "User-Agent":
+                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+                    "AppleWebKit/537.36 (KHTML, like Gecko) "
+                    "Chrome/122.0.0.0 Safari/537.36",
+                "Accept-Language":
+                    "fr-FR,fr;q=0.9,en;q=0.8",
+            }
+        )
         # Cache to avoid hitting the server more than necessary
         self._latest_request: tuple[str, Response] | None = None
         self._latest_soups: OrderedDict[str, BeautifulSoup] = OrderedDict[
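
Headers set once on the Session ride along with every request it makes. A small sketch of that behavior (using requests-mock, already a test dependency here; the URL is a placeholder):

from requests import Session
from requests_mock import Mocker

session = Session()
session.headers.update({"Accept-Language": "fr-FR,fr;q=0.9,en;q=0.8"})

with Mocker(session=session) as mock:
    mock.get("https://example.com/page", text="ok")
    response = session.get("https://example.com/page")

# The header configured once on the Session was sent automatically
assert response.request.headers["Accept-Language"].startswith("fr-FR")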
@@ -191,7 +206,8 @@ class Scraper:
             HTTPError: If the server returns an error status (4xx, 5xx).
         """
         target_url: str = self._url + subdir.lstrip("/")
-        response: Response = self._session.get(url=target_url, timeout=10)
+        # Send a GET request for the page; any HTTP error status raises
+        response: Response = self._session.get(url=target_url, timeout=30)
         response.raise_for_status()
         return response
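
raise_for_status() is what turns a 4xx/5xx response into the HTTPError documented in the docstring above. A minimal sketch of that path (again with requests-mock and a placeholder URL):

import pytest
from requests import HTTPError, Session
from requests_mock import Mocker

session = Session()
with Mocker(session=session) as mock:
    mock.get("https://example.com/missing", status_code=404)
    with pytest.raises(HTTPError):
        session.get("https://example.com/missing", timeout=30).raise_for_status()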
@@ -291,7 +307,7 @@ class Scraper:
         return _ScraperData(cast(dict[str, object], current_data))
 
-    def _geturlproductslist(self, subdir: str):
+    def _geturlproductslist(self, subdir: str) -> list[str] | None:
         """_summary_
 
         Args:
@@ -315,20 +331,22 @@ class Scraper:
         except (JSONDecodeError, HTTPError):
             return None
 
-    def getvins(self, subdir: str, filename: str):
+    def getvins(self, subdir: str, filename: str) -> None:
         """_summary_
 
         Args:
             subdir (str): _description_
             filename (str): _description_
         """
-        with open(filename, "a") as f:
+        with open(filename, "w") as f:
             cache: set[str] = set[str]()
             page = 0
             _ = f.write("Appellation,Robert,Robinson,Suckling,Prix\n")
             while True:
                 page += 1
-                products_list = self._geturlproductslist(f"{subdir}?page={page}")
+                products_list: list[str] | None = \
+                    self._geturlproductslist(f"{subdir}?page={page}")
                 if not products_list:
                     break
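
Two details worth noting here: opening with "w" instead of "a" rewrites the file on every run, so the header row is never duplicated across runs; and because fields are joined with raw commas, an appellation that itself contains a comma would break the CSV. A hedged alternative sketch using the standard csv module (not the project's code; the row values are made up):

import csv

with open("donnee.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["Appellation", "Robert", "Robinson", "Suckling", "Prix"])
    # csv.writer quotes fields containing commas, unlike a raw f.write
    writer.writerow(["Château Exemple, Margaux", "93.5", "17", "94.5", "45.0"])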
@@ -353,5 +371,15 @@ class Scraper:
                     f.flush()
 
 
+def main() -> None:
+    if len(argv) != 2:
+        raise ValueError(f"{argv[0]} <sub-url>")
+    scraper: Scraper = Scraper()
+    scraper.getvins(argv[1], "donnee.csv")
+
+
 if __name__ == "__main__":
-    Scraper().getvins("bordeaux.html", "donnee.csv")
+    try:
+        main()
+    except Exception as e:
+        print(f"ERROR: {e}")

tests/test_main.py (new file, empty)

tests/test_scraper.py

@@ -2,7 +2,7 @@ from json import dumps
 from unittest.mock import patch, mock_open
 import pytest
 from requests_mock import Mocker
-from main import Scraper
+from scraper import Scraper
 
 @pytest.fixture(autouse=True)
@@ -138,7 +138,7 @@ def mock_site():
                 "isSpirit": False,
             },
             "note_js": {
-                "valueId": "93-94",
+                "valueId": "93-94.5",
                 "name": "J. cherazade",
                 "value": "93-94",
                 "isSpirit": False,