"""
Unit tests for src/csv_grok.py
================================

Every numeric assertion is verified by an independent second method so
the test itself is trustworthy and not just a tautology.

Independent verification strategy
-----------------------------------
- Means are cross-checked via sum(values) / len(values) computed
  directly from the raw fixture lists — no pandas involved in the
  reference calculation.
- Combined row count is verified by simple integer addition.
- File content checks use plain string search, not the module's own
  formatting helpers.
"""
|
|||
|
|
|
|||
|
|
from __future__ import annotations

import math
from pathlib import Path

import pandas as pd
import pytest

# Make src/ importable without installing the package.
import sys
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

# Module under test: the load → combine → analyse → report pipeline.
from csv_grok import (
    AnalysisResult,
    CsvBundle,
    analyse,
    load_and_combine,
    load_csv,
    write_report,
)
|
|||
|
|
|
|||
|
|
# ---------------------------------------------------------------------------
# Paths to the shared fixture CSVs
# ---------------------------------------------------------------------------

DATA_DIR = Path(__file__).parent.parent / "data"
FILE1 = DATA_DIR / "file1.csv"
FILE2 = DATA_DIR / "file2.csv"

# ---------------------------------------------------------------------------
# Ground-truth values derived by hand from the fixture files
# (independent of pandas – used as the reference in assertions)
# ---------------------------------------------------------------------------

# file1.csv score column: Alice=88.5, Bob=91.0, Carol=76.3, Dave=83.7, Eve=95.2
FILE1_SCORES = [88.5, 91.0, 76.3, 83.7, 95.2]
FILE1_SCORE_MEAN_REF = sum(FILE1_SCORES) / len(FILE1_SCORES)  # 86.94

# file2.csv score column: Frank=70.1, Grace=88.9, Hank=65.4, Iris=79.8, Jack=82.3
FILE2_SCORES = [70.1, 88.9, 65.4, 79.8, 82.3]
FILE2_SCORE_MEAN_REF = sum(FILE2_SCORES) / len(FILE2_SCORES)  # 77.30

# file1.csv salary column
FILE1_SALARIES = [72000, 65000, 85000, 70000, 90000]
FILE1_SALARY_MEAN_REF = sum(FILE1_SALARIES) / len(FILE1_SALARIES)  # 76400.0

# file2.csv salary column
FILE2_SALARIES = [95000, 58000, 110000, 68000, 88000]
FILE2_SALARY_MEAN_REF = sum(FILE2_SALARIES) / len(FILE2_SALARIES)  # 83800.0
|
|||
|
|
|
|||
|
|
|
|||
|
|
# ===========================================================================
|
|||
|
|
# Tests for load_csv()
|
|||
|
|
# ===========================================================================
|
|||
|
|
|
|||
|
|
class TestLoadCsv:
    """Structural and happy-path coverage for load_csv()."""

    def test_returns_csv_bundle(self):
        """load_csv() must return a CsvBundle instance."""
        loaded = load_csv(FILE1)
        assert isinstance(loaded, CsvBundle)

    def test_path_attribute_preserved(self):
        """The bundle's .path must equal the path that was passed in."""
        loaded = load_csv(FILE1)
        assert loaded.path == FILE1

    def test_row_count_file1(self):
        """file1.csv has 5 data rows."""
        assert len(load_csv(FILE1).data) == 5

    def test_row_count_file2(self):
        """file2.csv has 5 data rows."""
        assert len(load_csv(FILE2).data) == 5

    def test_column_count(self):
        """Both files have exactly 4 columns."""
        for csv_path in (FILE1, FILE2):
            cols = list(load_csv(csv_path).data.columns)
            assert len(cols) == 4, (
                f"{csv_path.name} should have 4 columns, got {cols}"
            )

    def test_expected_columns_present(self):
        """Columns name, age, score, salary must all be present."""
        wanted = {"name", "age", "score", "salary"}
        for csv_path in (FILE1, FILE2):
            assert set(load_csv(csv_path).data.columns) == wanted

    def test_data_is_dataframe(self):
        """bundle.data must be a pandas DataFrame."""
        assert isinstance(load_csv(FILE1).data, pd.DataFrame)

    def test_missing_file_raises(self, tmp_path):
        """load_csv() must raise when the file does not exist."""
        missing = tmp_path / "nonexistent.csv"
        # Broad on purpose: the wrapper may raise its own exception type.
        with pytest.raises(Exception):
            load_csv(missing)
|
|||
|
|
|
|||
|
|
|
|||
|
|
# ===========================================================================
|
|||
|
|
# Tests for load_and_combine()
|
|||
|
|
# ===========================================================================
|
|||
|
|
|
|||
|
|
class TestLoadAndCombine:
    """Exercises the combine (union) stage of the pipeline."""

    def test_returns_three_tuple(self):
        """load_and_combine() must return a 3-tuple."""
        assert len(load_and_combine(FILE1, FILE2)) == 3

    def test_bundles_are_csv_bundles(self):
        """First two elements of the tuple must be CsvBundle instances."""
        first, second, _ = load_and_combine(FILE1, FILE2)
        assert isinstance(first, CsvBundle)
        assert isinstance(second, CsvBundle)

    def test_combined_is_dataframe(self):
        """Third element must be a pandas DataFrame."""
        combined = load_and_combine(FILE1, FILE2)[2]
        assert isinstance(combined, pd.DataFrame)

    def test_combined_row_count(self):
        """Combined DataFrame must have len(file1) + len(file2) rows.

        Independent check: load each file separately with pd.read_csv and
        add their lengths — no call to load_and_combine() in the reference.
        """
        first, second, combined = load_and_combine(FILE1, FILE2)

        # Reference: row counts taken straight from pd.read_csv.
        independent_total = len(pd.read_csv(FILE1)) + len(pd.read_csv(FILE2))

        assert len(combined) == independent_total
        assert len(combined) == len(first.data) + len(second.data)

    def test_combined_preserves_columns(self):
        """Combined DataFrame must retain all 4 original columns."""
        combined = load_and_combine(FILE1, FILE2)[2]
        assert set(combined.columns) == {"name", "age", "score", "salary"}

    def test_combined_index_is_reset(self):
        """Combined DataFrame index must be 0-based and contiguous."""
        combined = load_and_combine(FILE1, FILE2)[2]
        assert list(combined.index) == list(range(len(combined)))

    def test_bundle_paths_are_correct(self):
        """Each bundle must carry the path it was loaded from."""
        first, second, _ = load_and_combine(FILE1, FILE2)
        assert first.path == FILE1
        assert second.path == FILE2
|
|||
|
|
|
|||
|
|
|
|||
|
|
# ===========================================================================
|
|||
|
|
# Tests for analyse()
|
|||
|
|
# ===========================================================================
|
|||
|
|
|
|||
|
|
class TestAnalyse:
    """Covers the analyse step: per-file means and their difference."""

    @pytest.fixture(autouse=True)
    def _bundles(self):
        # Loaded fresh for every test; the combined frame is not needed here.
        self.bundle1, self.bundle2, _ = load_and_combine(FILE1, FILE2)

    # --- happy-path: score column -------------------------------------------

    def test_returns_analysis_result(self):
        """analyse() must return an AnalysisResult instance."""
        outcome = analyse(self.bundle1, self.bundle2, "score")
        assert isinstance(outcome, AnalysisResult)

    def test_column_attribute(self):
        """result.column must equal the column name passed in."""
        outcome = analyse(self.bundle1, self.bundle2, "score")
        assert outcome.column == "score"

    def test_mean_file1_score(self):
        """mean_file1 for 'score' must match the hand-computed reference.

        Reference: sum(FILE1_SCORES) / len(FILE1_SCORES) — no pandas.
        """
        outcome = analyse(self.bundle1, self.bundle2, "score")
        assert math.isclose(outcome.mean_file1, FILE1_SCORE_MEAN_REF, rel_tol=1e-9)

    def test_mean_file2_score(self):
        """mean_file2 for 'score' must match the hand-computed reference."""
        outcome = analyse(self.bundle1, self.bundle2, "score")
        assert math.isclose(outcome.mean_file2, FILE2_SCORE_MEAN_REF, rel_tol=1e-9)

    def test_difference_score(self):
        """difference must equal mean_file1 − mean_file2 (verified independently)."""
        outcome = analyse(self.bundle1, self.bundle2, "score")
        reference_diff = FILE1_SCORE_MEAN_REF - FILE2_SCORE_MEAN_REF
        assert math.isclose(outcome.difference, reference_diff, rel_tol=1e-9)
        # The dataclass fields must also agree with each other.
        internal_diff = outcome.mean_file1 - outcome.mean_file2
        assert math.isclose(outcome.difference, internal_diff, rel_tol=1e-9)

    # --- happy-path: salary column ------------------------------------------

    def test_mean_file1_salary(self):
        """mean_file1 for 'salary' must match the hand-computed reference."""
        outcome = analyse(self.bundle1, self.bundle2, "salary")
        assert math.isclose(outcome.mean_file1, FILE1_SALARY_MEAN_REF, rel_tol=1e-9)

    def test_mean_file2_salary(self):
        """mean_file2 for 'salary' must match the hand-computed reference."""
        outcome = analyse(self.bundle1, self.bundle2, "salary")
        assert math.isclose(outcome.mean_file2, FILE2_SALARY_MEAN_REF, rel_tol=1e-9)

    def test_difference_salary(self):
        """difference for 'salary' must equal mean_file1 − mean_file2."""
        outcome = analyse(self.bundle1, self.bundle2, "salary")
        reference_diff = FILE1_SALARY_MEAN_REF - FILE2_SALARY_MEAN_REF
        assert math.isclose(outcome.difference, reference_diff, rel_tol=1e-9)

    # --- error path ---------------------------------------------------------

    def test_missing_column_raises_value_error(self):
        """analyse() must raise ValueError for a column that does not exist."""
        with pytest.raises(ValueError, match="not found"):
            analyse(self.bundle1, self.bundle2, "nonexistent_column")

    def test_error_message_contains_column_name(self):
        """The ValueError message must name the missing column."""
        missing = "ghost_column"
        with pytest.raises(ValueError, match=missing):
            analyse(self.bundle1, self.bundle2, missing)

    def test_non_numeric_column_raises_or_returns_nan(self):
        """Requesting the 'name' (string) column should either raise or return NaN mean."""
        try:
            outcome = analyse(self.bundle1, self.bundle2, "name")
            # No exception: the means must then be NaN (pandas behaviour for strings)
            assert math.isnan(outcome.mean_file1) or math.isnan(outcome.mean_file2)
        except (TypeError, ValueError):
            pass  # raising is also acceptable
|
|||
|
|
|
|||
|
|
|
|||
|
|
# ===========================================================================
|
|||
|
|
# Tests for write_report()
|
|||
|
|
# ===========================================================================
|
|||
|
|
|
|||
|
|
class TestWriteReport:
    """Tests for the report-writing step."""

    @pytest.fixture()
    def result(self):
        # Analyse the 'score' column as the fixture every test consumes.
        bundle1, bundle2, _ = load_and_combine(FILE1, FILE2)
        return analyse(bundle1, bundle2, "score")

    def test_file_is_created(self, tmp_path, result):
        """write_report() must create the output file."""
        report_path = tmp_path / "report.txt"
        write_report(result, report_path)
        assert report_path.exists()

    def test_file_contains_column_name(self, tmp_path, result):
        """The report must mention the column that was analysed."""
        report_path = tmp_path / "report.txt"
        write_report(result, report_path)
        assert "score" in report_path.read_text()

    def test_file_contains_mean1(self, tmp_path, result):
        """The report must contain the mean of file 1 (to 4 decimal places)."""
        report_path = tmp_path / "report.txt"
        write_report(result, report_path)
        # Independent reference: format the hand-computed value the same way
        assert f"{FILE1_SCORE_MEAN_REF:.4f}" in report_path.read_text()

    def test_file_contains_mean2(self, tmp_path, result):
        """The report must contain the mean of file 2 (to 4 decimal places)."""
        report_path = tmp_path / "report.txt"
        write_report(result, report_path)
        assert f"{FILE2_SCORE_MEAN_REF:.4f}" in report_path.read_text()

    def test_file_is_non_empty(self, tmp_path, result):
        """The report file must not be empty."""
        report_path = tmp_path / "report.txt"
        write_report(result, report_path)
        assert report_path.stat().st_size > 0
|