Merge pull request #175 from MITLibraries/HRQB-49-historical-libhr-data

HRQB 49 - Include historical LibHR data
MITLibraries · Sep 18, 2024 · 63de91c
2 parents 5a35f77 + 6364f99
commit 63de91c
Show file tree

Hide file tree

Showing 5 changed files with 49 additions and 38 deletions.
diff --git a/hrqb/tasks/employee_appointments.py b/hrqb/tasks/employee_appointments.py
@@ -57,6 +57,9 @@ def get_dataframe(self) -> pd.DataFrame:
         libhr_df = self.named_inputs["ExtractQBLibHREmployeeAppointments"].read()
         depts_df = self.named_inputs["ExtractQBDepartments"].read()
 
+        # filter libhr data to active appointments, with position IDs
+        libhr_df = libhr_df[(libhr_df["Active"]) & ~(libhr_df["Position ID"].isna())]
+
         # normalize position id to string and pad zeros
         libhr_df["Position ID"] = libhr_df["Position ID"].apply(
             lambda x: str(int(x)).zfill(8)

diff --git a/hrqb/tasks/libhr_employee_appointments.py b/hrqb/tasks/libhr_employee_appointments.py
@@ -1,12 +1,14 @@
 """hrqb.tasks.libhr_employee_appointments"""
 
 import luigi  # type: ignore[import-untyped]
+import numpy as np
 import pandas as pd
 
 from hrqb.base.task import (
     PandasPickleTask,
     QuickbaseUpsertTask,
 )
+from hrqb.utils import md5_hash_from_values
 from hrqb.utils.quickbase import QBClient
 
 
@@ -23,7 +25,16 @@ class ExtractLibHREmployeeAppointments(PandasPickleTask):
     csv_filepath = luigi.Parameter()
 
     def get_dataframe(self) -> pd.DataFrame:
-        return pd.read_csv(self.csv_filepath)
+        # load the raw LibHR rows from the CSV at self.csv_filepath
+        libhr_df = pd.read_csv(self.csv_filepath)
+
+        # convert 'Active' column to the "Yes"/"No" strings of a Quickbase checkbox;
+        # np.True_ / np.False_ keys are what results from Excel --> CSV --> pandas
+        libhr_df["Active"] = libhr_df["Active"].replace(
+            {np.True_: "Yes", np.False_: "No"}
+        )
+
+        return libhr_df
 
 
 class ExtractQBDepartments(PandasPickleTask):
@@ -67,13 +78,26 @@ def get_dataframe(self) -> pd.DataFrame:
             how="left",
         )
 
+        # mint a unique, deterministic value for the merge "Key" field
+        libhr_df["Key"] = libhr_df.apply(
+            lambda row: md5_hash_from_values(
+                [
+                    str(row["MIT ID"]),
+                    str(row["HC ID"]),
+                ]
+            ),
+            axis=1,
+        )
+
         fields = {
             "MIT ID": "Related Employee MIT ID",
             "Supervisor ID": "Related Supervisor MIT ID",
             "Cost Object": "Cost Object",
             "HC ID": "HC ID",
             "Position ID": "Position ID",
             "Related Department ID": "Related Department ID",
+            "Active": "Active",
+            "Key": "Key",
         }
         return libhr_df[fields.keys()].rename(columns=fields)
 
@@ -86,7 +110,7 @@ class LoadLibHREmployeeAppointments(QuickbaseUpsertTask):
     @property
     def merge_field(self) -> str | None:
         """Explicitly merge on the minted, deterministic Key field."""
-        return "Position ID"
+        return "Key"
 
     def requires(self) -> list[luigi.Task]:  # pragma: nocover
         return [

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -653,6 +653,7 @@ def task_extract_qb_libhr_complete(all_tasks_pipeline_name):
                     "Position ID": 987654321,
                     "Cost Object": 7777777,
                     "Related Department ID": 42.0,
+                    "Active": True,
                 }
             ]
         )

diff --git a/tests/fixtures/libhr_static_data.csv b/tests/fixtures/libhr_static_data.csv
@@ -1,4 +1,4 @@
-MIT ID,HC ID,Full Name,Position,Position ID,Employee Type,Supervisor ID,Supervisor Name,Cost Object,Department
-123456789,L-001,"Doe, John",Science Librarian,888888888,Admin Staff,444444444,"Smith, Fancy",555555555,DDC
-987654321,L-100,"Doe, Jane",Data Engineer,999999999,Admin Staff,444444444,"Smith, Fancy",555555555,ITS
-987654321,L-100,"Doe, Jane",DevOps Engineer,999999991,Admin Staff,444444444,"Smith, Fancy",555555555,BAD_ACRO
+MIT ID,HC ID,Full Name,Position,Position ID,Employee Type,Supervisor ID,Supervisor Name,Cost Object,Department,Active
+123456789,L-001,"Doe, John",Science Librarian,888888888,Admin Staff,444444444,"Smith, Fancy",555555555,DDC,True
+987654321,L-100,"Doe, Jane",Data Engineer,999999999,Admin Staff,444444444,"Smith, Fancy",555555555,ITS,True
+987654321,L-101x,"Doe, Jane",DevOps Engineer,999999991,Admin Staff,444444444,"Smith, Fancy",555555555,BAD_ACRO,False
diff --git a/tests/tasks/test_libhr_employee_appointments.py b/tests/tasks/test_libhr_employee_appointments.py
@@ -1,6 +1,5 @@
 # ruff: noqa: PD901, PLR2004
 
-import numpy as np
 import pandas as pd
 
 
@@ -18,39 +17,23 @@ def test_transform_libhr_employee_appointments_merge_departments(
     task_transform_libhr_employee_appointments,
 ):
     new_df = task_transform_libhr_employee_appointments.get_dataframe()
-    assert new_df.equals(
-        pd.DataFrame(
-            [
-                {
-                    "Related Employee MIT ID": 123456789,
-                    "Related Supervisor MIT ID": 444444444,
-                    "Cost Object": 555555555,
-                    "HC ID": "L-001",
-                    "Position ID": 888888888,
-                    "Related Department ID": 35.0,
-                },
-                {
-                    "Related Employee MIT ID": 987654321,
-                    "Related Supervisor MIT ID": 444444444,
-                    "Cost Object": 555555555,
-                    "HC ID": "L-100",
-                    "Position ID": 999999999,
-                    "Related Department ID": 40.0,
-                },
-                {
-                    "Related Employee MIT ID": 987654321,
-                    "Related Supervisor MIT ID": 444444444,
-                    "Cost Object": 555555555,
-                    "HC ID": "L-100",
-                    "Position ID": 999999991,
-                    "Related Department ID": np.nan,
-                },
-            ]
-        )
-    )
+    assert new_df.iloc[0]["Related Department ID"] == 35.0
+    assert new_df.iloc[1]["Related Department ID"] == 40.0
+    assert pd.isna(new_df.iloc[2]["Related Department ID"])  # no match in merge, so NULL
 
 
 def test_load_libhr_employee_appointments_merge_field_set(
     task_load_libhr_employee_appointments,
 ):
-    assert task_load_libhr_employee_appointments.merge_field == "Position ID"
+    assert task_load_libhr_employee_appointments.merge_field == "Key"
+
+
+def test_transform_libhr_employee_appointments_merge_field_key_values(
+    task_transform_libhr_employee_appointments,
+):
+    new_df = task_transform_libhr_employee_appointments.get_dataframe()
+    assert list(new_df["Key"]) == [  # per-row hash of (MIT ID, HC ID), in row order
+        "81cf06bfd65aa1f7019750c57a79be99",
+        "6e07102ee39ec1f22c63231d090bd4dd",
+        "af08a24eeb35fae63fa76e755537b949",
+    ]