Mirror of https://github.com/guezoloic/millesima_projetS6.git
Synced 2026-03-29 03:23:47 +00:00

Compare commits: exo7-loic...0f6eb856c6 (10 commits)
Commits in this compare:
0f6eb856c6, d62145e250, 829c303e78, b584f9a301, 547c7ec4c1,
0aa765d6a0, 8a357abe86, 2f5af5aabf, 0182bbbf20, 8cae082344
pyproject.toml (new file, +15)

```diff
@@ -0,0 +1,15 @@
+[project]
+name = "projet-millesima-s6"
+version = "0.1.0"
+dependencies = [
+    "requests==2.32.5",
+    "beautifulsoup4==4.14.3",
+    "pandas==2.3.3",
+]
+
+[project.optional-dependencies]
+test = ["pytest==8.4.2", "requests-mock==1.12.1"]
+
+[build-system]
+requires = ["setuptools"]
+build-backend = "setuptools.build_meta"
```
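For reference, the pinned metadata above can be read back with the standard-library TOML parser. A minimal sketch, assuming Python 3.11+ (which ships `tomllib`):

```python
import tomllib

# tomllib requires the file to be opened in binary mode
with open("pyproject.toml", "rb") as f:
    meta = tomllib.load(f)

print(meta["project"]["name"])          # projet-millesima-s6
print(meta["project"]["dependencies"])  # ['requests==2.32.5', ...]
```

The exact `==` pins replace the `>=` ranges from the deleted requirements file below, trading flexibility for reproducible installs.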
Deleted requirements file (-3)

```diff
@@ -1,3 +0,0 @@
-requests>=2.32.5
-requests-mock>=1.12.1
-beautifulsoup4>=4.14.3
```

Binary file not shown.
src/main.py (new executable file, +20)

```diff
@@ -0,0 +1,20 @@
+#!/usr/bin/env python3
+
+from os import getcwd
+from os.path import normpath, join
+from sys import argv
+from pandas import read_csv, DataFrame
+
+def main() -> None:
+    if len(argv) != 2:
+        raise ValueError(f"{argv[0]} <filename.csv>")
+
+    path: str = normpath(join(getcwd(), argv[1]))
+    db: DataFrame = read_csv(path)
+    print(db.all())
+
+if __name__ == "__main__":
+    try:
+        main()
+    except Exception as e:
+        print(f"ERREUR: {e}")
```
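One caveat in the file above: `DataFrame.all()` reduces each column to a single boolean, so the script prints truth values rather than the CSV contents. A hedged sketch of what was presumably intended (`donnee.csv` is an assumed input name, taken from the scraper below):

```python
from pandas import read_csv

db = read_csv("donnee.csv")  # assumed input: the CSV the scraper writes
print(db.head())             # first rows of the parsed table
```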
main.py → src/scraper.py (renamed, normal file → executable file, 46 lines changed)
```diff
@@ -1,3 +1,6 @@
+#!/usr/bin/env python3
+
+from sys import argv
 from typing import cast
 from requests import HTTPError, Response, Session
 from bs4 import BeautifulSoup, Tag
```
```diff
@@ -6,8 +9,8 @@ from json import JSONDecodeError, loads


 class _ScraperData:
-    """_summary_
-    """
+    """_summary_"""

     def __init__(self, data: dict[str, object]) -> None:
         """_summary_
```
```diff
@@ -121,7 +124,7 @@ class _ScraperData:

         val = cast(str, app_dict.get("value")).rstrip("+").split("-")
         if len(val) > 1 and val[1] != "":
-            val[0] = str((int(val[0]) + int(val[1])) / 2)
+            val[0] = str(round((float(val[0]) + float(val[1])) / 2, 1))

             return val[0]
         return None
```
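The switch from `int()` to `float()` plus `round(..., 1)` above matters once ratings carry half points. A standalone sketch of the same computation (`midpoint` is a hypothetical extraction of that logic):

```python
def midpoint(value: str) -> str:
    # "93-94+" -> ["93", "94"]; bare values like "95" stay single-element
    parts = value.rstrip("+").split("-")
    if len(parts) > 1 and parts[1] != "":
        return str(round((float(parts[0]) + float(parts[1])) / 2, 1))
    return parts[0]

print(midpoint("93-94"))    # 93.5
print(midpoint("93-94.5"))  # 93.8 -- int() would raise ValueError here
```

The test fixture change near the end of this diff introduces the same half-point format.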
```diff
@@ -171,6 +174,18 @@ class Scraper:
         # Very useful to avoid resending the same TCP handshakes and to
         # keep a persistent connection with the server
         self._session: Session = Session()
+        # Build a "fake identity card" so the site does not block us
+        # for being a bot
+        self._session.headers.update(
+            {
+                "User-Agent":
+                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) \
+                    AppleWebKit/537.36 (KHTML, like Gecko) \
+                    Chrome/122.0.0.0 Safari/537.36",
+                "Accept-Language": "fr-FR,fr;q=0.9,en;q=0.8",
+            }
+        )
+        # Cache system to avoid hitting the server unnecessarily
         self._latest_request: tuple[(str, Response)] | None = None
         self._latest_soups: OrderedDict[str, BeautifulSoup] = OrderedDict[
```
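The same session pattern in isolation: one `requests.Session` reuses the underlying TCP connection across calls (keep-alive) and attaches the browser-like headers to every request. A minimal sketch; the URL is a placeholder:

```python
from requests import Session

session = Session()  # one connection pool, reused across requests
session.headers.update({
    "User-Agent": "Mozilla/5.0 (example)",         # spoofed browser identity
    "Accept-Language": "fr-FR,fr;q=0.9,en;q=0.8",  # prefer French pages
})
resp = session.get("https://example.com/", timeout=30)
print(resp.status_code)
```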
```diff
@@ -191,7 +206,8 @@ class Scraper:
             HTTPError: If the server returns an error code (4xx, 5xx).
         """
         target_url: str = self._url + subdir.lstrip("/")
-        response: Response = self._session.get(url=target_url, timeout=10)
+        # send a GET request for the page; on error, raise
+        response: Response = self._session.get(url=target_url, timeout=30)
         response.raise_for_status()
         return response

```
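What the `timeout` plus `raise_for_status()` pair buys, shown in isolation: the timeout bounds how long the call may block, and any 4xx/5xx status surfaces as an `HTTPError` instead of a silently bad response. The URL below is a placeholder:

```python
from requests import HTTPError, Session

session = Session()
try:
    resp = session.get("https://example.com/missing", timeout=30)
    resp.raise_for_status()  # e.g. a 404 raises HTTPError here
except HTTPError as err:
    print(f"server returned an error: {err}")
```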
```diff
@@ -291,7 +307,7 @@ class Scraper:

         return _ScraperData(cast(dict[str, object], current_data))

-    def _geturlproductslist(self, subdir: str):
+    def _geturlproductslist(self, subdir: str) -> list[str] | None:
         """_summary_

         Args:
```
```diff
@@ -315,20 +331,22 @@ class Scraper:
         except (JSONDecodeError, HTTPError):
             return None

-    def getvins(self, subdir: str, filename: str):
+    def getvins(self, subdir: str, filename: str) -> None:
         """_summary_

         Args:
             subdir (str): _description_
             filename (str): _description_
         """
-        with open(filename, "a") as f:
+        with open(filename, "w") as f:
             cache: set[str] = set[str]()
             page = 0
             _ = f.write("Appellation,Robert,Robinson,Suckling,Prix\n")

             while True:
                 page += 1
-                products_list = self._geturlproductslist(f"{subdir}?page={page}")
+                products_list: list[str] | None = \
+                    self._geturlproductslist(f"{subdir}?page={page}")

                 if not products_list:
                     break
```
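Two behaviors in this hunk are worth isolating: opening the CSV with `"w"` rewrites it from scratch on each run (the previous `"a"` kept appending to stale data), and the loop walks `?page=1, 2, ...` until an empty listing ends the crawl. A self-contained sketch of the pagination, with `fetch_page` as a hypothetical stand-in for `_geturlproductslist`:

```python
def fetch_page(page: int) -> list[str] | None:
    # assumed sample data; the real method scrapes the listing page
    data = {1: ["/vin-a", "/vin-b"], 2: ["/vin-c"]}
    return data.get(page)

seen: set[str] = set()
page = 0
while True:
    page += 1
    products = fetch_page(page)
    if not products:  # None or an empty list ends the crawl
        break
    seen.update(products)

print(sorted(seen))  # ['/vin-a', '/vin-b', '/vin-c']
```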
```diff
@@ -353,5 +371,15 @@ class Scraper:
             f.flush()


+def main() -> None:
+    if len(argv) != 2:
+        raise ValueError(f"{argv[0]} <sous-url>")
+    scraper: Scraper = Scraper()
+    scraper.getvins(argv[1], "donnee.csv")
+
+
 if __name__ == "__main__":
-    Scraper().getvins("bordeaux.html", "donnee.csv")
+    try:
+        main()
+    except Exception as e:
+        print(f"ERREUR: {e}")
```
tests/test_main.py

```diff
@@ -2,7 +2,7 @@ from json import dumps
 from unittest.mock import patch, mock_open
 import pytest
 from requests_mock import Mocker
-from main import Scraper
+from scraper import Scraper


 @pytest.fixture(autouse=True)
```

```diff
@@ -138,7 +138,7 @@ def mock_site():
             "isSpirit": False,
         },
         "note_js": {
-            "valueId": "93-94",
+            "valueId": "93-94.5",
             "name": "J. cherazade",
             "value": "93-94",
             "isSpirit": False,
```
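For context, the mocking pattern these tests rely on: `requests_mock.Mocker` intercepts HTTP calls made through `requests`, so the scraper can be exercised without network access. A minimal sketch with an illustrative URL and payload:

```python
import requests
from requests_mock import Mocker

with Mocker() as m:
    m.get("https://example.com/bordeaux.html", json={"ok": True})
    resp = requests.get("https://example.com/bordeaux.html")
    print(resp.json())  # {'ok': True}
```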