ajout: restructuration de la cleaning

2026-03-28 18:03:47 +00:00 · 2026-03-06 17:56:07 +01:00
parent 69b8b4ce1f
commit acf4ddd881
2 changed files with 121 additions and 133 deletions
--- a/tests/test_cleaning.py
+++ b/tests/test_cleaning.py
@@ -1,64 +1,68 @@
-import pandas as pd
 import pytest
 from pandas import DataFrame
-
-from cleaning import (
-    SCORE_COLS,
-    drop_empty_appellation,
-    mean_score,
-    fill_missing_scores,
-    encode_appellation,
-)
+from unittest.mock import patch, mock_open
+from cleaning import Cleaning


 @pytest.fixture
-def df_raw() -> DataFrame:
-    return pd.DataFrame({
-        "Appellation": ["Pauillac", "Pauillac ", "Margaux", None, "Pomerol", "Pomerol"],
-        "Robert":      ["95", None, "bad", 90, None, None],
-        "Robinson":    [None, "93", 18, None, None, None],
-        "Suckling":    [96, None, None, None, 91, None],
-        "Prix":        ["10.0", "11.0", "20.0", "30.0", "40.0", "50.0"],
-    })
+def cleaning_raw() -> Cleaning:
+    """Return a Cleaning built from the in-memory CSV below; same data, column by column:
+    "Appellation": ["Pauillac", "Pauillac ", "Margaux", None  , "Pomerol", "Pomerol"],
+    "Robert":      ["95"      , None       , "bad"    , 90    , None     , None     ],
+    "Robinson":    [None      , "93"       , 18       , None  , None     , None     ],
+    "Suckling":    [96        , None       , None     , None  , 91       , None     ],
+    "Prix":        ["10.0"    , "11.0"     , "20.0"   , "30.0", "40.0"   , "50.0"   ],
+    """
+    csv_content = """Appellation,Robert,Robinson,Suckling,Prix
+Pauillac,95,,96,10.0
+Pauillac ,,93,,11.0
+Margaux,bad,18,,20.0
+,90,,,30.0
+Pomerol,,,91,40.0
+Pomerol,,,,50.0
+"""
+    m = mock_open(read_data=csv_content)  # file-like mock that serves the CSV text above
+    with patch("builtins.open", m):  # presumably Cleaning reads "donnee.csv" via open() - confirm
+        return Cleaning("donnee.csv")


-def test_drop_empty_appellation(df_raw: DataFrame):
-    out = drop_empty_appellation(df_raw)
+def test_drop_empty_appellation(cleaning_raw: Cleaning) -> None:
+    out = cleaning_raw.drop_empty_appellation().getVins()
    assert out["Appellation"].isna().sum() == 0
-    assert len(out) == 5 
+    assert len(out) == 5  # the fixture CSV has 6 rows, one with an empty Appellation


-def test_mean_score_zero_when_no_scores(df_raw: DataFrame):
-    out = drop_empty_appellation(df_raw)
-    m = mean_score(out, "Robert")
+def test_mean_score_zero_when_no_scores(cleaning_raw: Cleaning) -> None:
+    out = cleaning_raw.drop_empty_appellation()
+    m = out._mean_score("Robert")
    assert list(m.columns) == ["Appellation", "mean_Robert"]
-
-    # Pomerol n'a aucune note Robert => moyenne doit être 0
-    pomerol_mean = m.loc[m["Appellation"].str.strip() == "Pomerol", "mean_Robert"].iloc[0]
+    pomerol_mean = m.loc[m["Appellation"].str.strip() == "Pomerol", "mean_Robert"].iloc[
+        0
+    ]  # neither Pomerol row has a Robert score, so the mean is expected to be 0
    assert pomerol_mean == 0


-def test_fill_missing_scores(df_raw: DataFrame):
-    out = drop_empty_appellation(df_raw)
-    filled = fill_missing_scores(out)
+def test_fill_missing_scores(cleaning_raw: Cleaning):
+    cleaning_raw._vins["Appellation"] = cleaning_raw._vins["Appellation"].str.strip()

-    # plus de NaN dans les colonnes de scores
-    for col in SCORE_COLS:
+    cleaning_raw.drop_empty_appellation()
+    filled = cleaning_raw.fill_missing_scores().getVins()
+    for col in cleaning_raw.SCORE_COLS:  # no score column may keep a missing value
        assert filled[col].isna().sum() == 0
-
-    assert filled.loc[1, "Robert"] == 95.0
-
-    # pas de colonnes temporaires mean_*
-    for col in SCORE_COLS:
-        assert f"mean_{col}" not in filled.columns
+    # "Pauillac " was stripped above, so both Pauillac rows are compared against 95.0 here.
+    pauillac_robert = filled[filled["Appellation"] == "Pauillac"]["Robert"]
+    assert (pauillac_robert == 95.0).all()


-def test_encode_appellation(df_raw: DataFrame):
-    out = drop_empty_appellation(df_raw)
-    filled = fill_missing_scores(out)
-    encoded = encode_appellation(filled)
+def test_encode_appellation(cleaning_raw: Cleaning):
+    cleaning_raw._vins["Appellation"] = cleaning_raw._vins["Appellation"].str.strip()

-    # la colonne texte disparaît
-    assert "Appellation" not in encoded.columns
-    assert "Pauillac" in encoded.columns
-    assert encoded.loc[0, "Pauillac"] == 1
+    out = (
+        cleaning_raw.drop_empty_appellation()
+        .fill_missing_scores()
+        .encode_appellation()
+        .getVins()
+    )
+    assert "Appellation" not in out.columns  # the text column must be gone after encoding
+    assert "Pauillac" in out.columns
+    assert int(out.loc[0, "Pauillac"]) == 1  # row 0 of the fixture CSV is a Pauillac wine