From f8cbabb7d04224d274af07f5238fc92d6a88cc51 Mon Sep 17 00:00:00 2001
From: Graham Hukill <ghukill@gmail.com>
Date: Tue, 17 Sep 2024 16:11:43 -0400
Subject: [PATCH 1/2] Update LibHR transform and load

Why these changes are being introduced:

A new requirement was added: the table 'LibHR Employee Appointments'
may now contain multiple rows for the same employee, each with a
different headcount ID (HC ID).  The additional rows represent positions
the employee previously held.

Relatedly, because this Quickbase table must now hold rows for
employees' old headcount IDs, it needs a way to distinguish
'active' from 'inactive' rows.

How this addresses that need:
* Add new 'Active' column to Quickbase table 'LibHR Employee Appointments'
* Add new 'Key' field to the same table, used as the merge field for upserts
  * this field is the MD5 hash of MIT ID + Headcount ID (HC ID)

Side effects of this change:
* None

Relevant ticket(s):
* https://mitlibraries.atlassian.net/browse/HRQB-49
---
 hrqb/tasks/libhr_employee_appointments.py     | 28 ++++++++++-
 tests/fixtures/libhr_static_data.csv          |  8 ++--
 .../tasks/test_libhr_employee_appointments.py | 47 ++++++-------------
 3 files changed, 45 insertions(+), 38 deletions(-)

diff --git a/hrqb/tasks/libhr_employee_appointments.py b/hrqb/tasks/libhr_employee_appointments.py
index b77c34e..3796431 100644
--- a/hrqb/tasks/libhr_employee_appointments.py
+++ b/hrqb/tasks/libhr_employee_appointments.py
@@ -1,12 +1,14 @@
 """hrqb.tasks.libhr_employee_appointments"""
 
 import luigi  # type: ignore[import-untyped]
+import numpy as np
 import pandas as pd
 
 from hrqb.base.task import (
     PandasPickleTask,
     QuickbaseUpsertTask,
 )
+from hrqb.utils import md5_hash_from_values
 from hrqb.utils.quickbase import QBClient
 
 
@@ -23,7 +25,16 @@ class ExtractLibHREmployeeAppointments(PandasPickleTask):
     csv_filepath = luigi.Parameter()
 
     def get_dataframe(self) -> pd.DataFrame:
-        return pd.read_csv(self.csv_filepath)
+        # read CSV file
+        libhr_df = pd.read_csv(self.csv_filepath)
+
+        # convert 'Active' column to Quickbase Yes/No checkbox value
+        # np.False_ and np.True_ values are the result of Excel --> CSV --> pandas
+        libhr_df["Active"] = libhr_df["Active"].replace(
+            {np.True_: "Yes", np.False_: "No"}
+        )
+
+        return libhr_df
 
 
 class ExtractQBDepartments(PandasPickleTask):
@@ -67,6 +78,17 @@ def get_dataframe(self) -> pd.DataFrame:
             how="left",
         )
 
+        # mint a unique, deterministic value for the merge "Key" field
+        libhr_df["Key"] = libhr_df.apply(
+            lambda row: md5_hash_from_values(
+                [
+                    str(row["MIT ID"]),
+                    str(row["HC ID"]),
+                ]
+            ),
+            axis=1,
+        )
+
         fields = {
             "MIT ID": "Related Employee MIT ID",
             "Supervisor ID": "Related Supervisor MIT ID",
@@ -74,6 +96,8 @@ def get_dataframe(self) -> pd.DataFrame:
             "HC ID": "HC ID",
             "Position ID": "Position ID",
             "Related Department ID": "Related Department ID",
+            "Active": "Active",
+            "Key": "Key",
         }
         return libhr_df[fields.keys()].rename(columns=fields)
 
@@ -86,7 +110,7 @@ class LoadLibHREmployeeAppointments(QuickbaseUpsertTask):
     @property
     def merge_field(self) -> str | None:
         """Explicitly merge on unique Position ID field."""
-        return "Position ID"
+        return "Key"
 
     def requires(self) -> list[luigi.Task]:  # pragma: nocover
         return [
diff --git a/tests/fixtures/libhr_static_data.csv b/tests/fixtures/libhr_static_data.csv
index 0f0556c..7e73448 100644
--- a/tests/fixtures/libhr_static_data.csv
+++ b/tests/fixtures/libhr_static_data.csv
@@ -1,4 +1,4 @@
-MIT ID,HC ID,Full Name,Position,Position ID,Employee Type,Supervisor ID,Supervisor Name,Cost Object,Department
-123456789,L-001,"Doe, John",Science Librarian,888888888,Admin Staff,444444444,"Smith, Fancy",555555555,DDC
-987654321,L-100,"Doe, Jane",Data Engineer,999999999,Admin Staff,444444444,"Smith, Fancy",555555555,ITS
-987654321,L-100,"Doe, Jane",DevOps Engineer,999999991,Admin Staff,444444444,"Smith, Fancy",555555555,BAD_ACRO
\ No newline at end of file
+MIT ID,HC ID,Full Name,Position,Position ID,Employee Type,Supervisor ID,Supervisor Name,Cost Object,Department,Active
+123456789,L-001,"Doe, John",Science Librarian,888888888,Admin Staff,444444444,"Smith, Fancy",555555555,DDC,True
+987654321,L-100,"Doe, Jane",Data Engineer,999999999,Admin Staff,444444444,"Smith, Fancy",555555555,ITS,True
+987654321,L-101,"Doe, Jane",DevOps Engineer,999999991,Admin Staff,444444444,"Smith, Fancy",555555555,BAD_ACRO,True
\ No newline at end of file
diff --git a/tests/tasks/test_libhr_employee_appointments.py b/tests/tasks/test_libhr_employee_appointments.py
index f763eed..d3d16bf 100644
--- a/tests/tasks/test_libhr_employee_appointments.py
+++ b/tests/tasks/test_libhr_employee_appointments.py
@@ -1,6 +1,5 @@
 # ruff: noqa: PD901, PLR2004
 
-import numpy as np
 import pandas as pd
 
 
@@ -18,39 +17,23 @@ def test_transform_libhr_employee_appointments_merge_departments(
     task_transform_libhr_employee_appointments,
 ):
     new_df = task_transform_libhr_employee_appointments.get_dataframe()
-    assert new_df.equals(
-        pd.DataFrame(
-            [
-                {
-                    "Related Employee MIT ID": 123456789,
-                    "Related Supervisor MIT ID": 444444444,
-                    "Cost Object": 555555555,
-                    "HC ID": "L-001",
-                    "Position ID": 888888888,
-                    "Related Department ID": 35.0,
-                },
-                {
-                    "Related Employee MIT ID": 987654321,
-                    "Related Supervisor MIT ID": 444444444,
-                    "Cost Object": 555555555,
-                    "HC ID": "L-100",
-                    "Position ID": 999999999,
-                    "Related Department ID": 40.0,
-                },
-                {
-                    "Related Employee MIT ID": 987654321,
-                    "Related Supervisor MIT ID": 444444444,
-                    "Cost Object": 555555555,
-                    "HC ID": "L-100",
-                    "Position ID": 999999991,
-                    "Related Department ID": np.nan,
-                },
-            ]
-        )
-    )
+    assert new_df.iloc[0]["Related Department ID"] == 35.0
+    assert new_df.iloc[1]["Related Department ID"] == 40.0
+    assert pd.isna(new_df.iloc[2]["Related Department ID"])  # no match in merge, so NULL
 
 
 def test_load_libhr_employee_appointments_merge_field_set(
     task_load_libhr_employee_appointments,
 ):
-    assert task_load_libhr_employee_appointments.merge_field == "Position ID"
+    assert task_load_libhr_employee_appointments.merge_field == "Key"
+
+
+def test_transform_libhr_employee_appointments_merge_field_key_values(
+    task_transform_libhr_employee_appointments,
+):
+    new_df = task_transform_libhr_employee_appointments.get_dataframe()
+    assert list(new_df["Key"]) == [
+        "81cf06bfd65aa1f7019750c57a79be99",
+        "6e07102ee39ec1f22c63231d090bd4dd",
+        "744aefdd46c40523d60cf69490d81655",
+    ]

From 6364f993104eb71cbcff1953526cfd9737af2ceb Mon Sep 17 00:00:00 2001
From: Graham Hukill <ghukill@gmail.com>
Date: Wed, 18 Sep 2024 10:01:58 -0400
Subject: [PATCH 2/2] Employee Appointments join on active LibHR Data

Why these changes are being introduced:

Now that the Quickbase table 'LibHR Employee Appointments' has historical data
(headcount IDs for previous employee appointments), it is important that
Employee Appointments only join on 'active' LibHR data.

How this addresses that need:
* For task TransformEmployeeAppointments, only join on active LibHR Employee
  Appointment rows that have a Position ID
* Update fixture to simulate inactive/historical data in LibHR data

Side effects of this change:
* None

Relevant ticket(s):
* https://mitlibraries.atlassian.net/browse/HRQB-49
---
 hrqb/tasks/employee_appointments.py             | 3 +++
 tests/conftest.py                               | 1 +
 tests/fixtures/libhr_static_data.csv            | 2 +-
 tests/tasks/test_libhr_employee_appointments.py | 2 +-
 4 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/hrqb/tasks/employee_appointments.py b/hrqb/tasks/employee_appointments.py
index 664c86d..982b636 100644
--- a/hrqb/tasks/employee_appointments.py
+++ b/hrqb/tasks/employee_appointments.py
@@ -57,6 +57,9 @@ def get_dataframe(self) -> pd.DataFrame:
         libhr_df = self.named_inputs["ExtractQBLibHREmployeeAppointments"].read()
         depts_df = self.named_inputs["ExtractQBDepartments"].read()
 
+        # filter libhr data to active appointments, with position IDs
+        libhr_df = libhr_df[(libhr_df["Active"]) & ~(libhr_df["Position ID"].isna())]
+
         # normalize position id to string and pad zeros
         libhr_df["Position ID"] = libhr_df["Position ID"].apply(
             lambda x: str(int(x)).zfill(8)
diff --git a/tests/conftest.py b/tests/conftest.py
index ae1a0a3..542dca2 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -653,6 +653,7 @@ def task_extract_qb_libhr_complete(all_tasks_pipeline_name):
                     "Position ID": 987654321,
                     "Cost Object": 7777777,
                     "Related Department ID": 42.0,
+                    "Active": True,
                 }
             ]
         )
diff --git a/tests/fixtures/libhr_static_data.csv b/tests/fixtures/libhr_static_data.csv
index 7e73448..a8be8cc 100644
--- a/tests/fixtures/libhr_static_data.csv
+++ b/tests/fixtures/libhr_static_data.csv
@@ -1,4 +1,4 @@
 MIT ID,HC ID,Full Name,Position,Position ID,Employee Type,Supervisor ID,Supervisor Name,Cost Object,Department,Active
 123456789,L-001,"Doe, John",Science Librarian,888888888,Admin Staff,444444444,"Smith, Fancy",555555555,DDC,True
 987654321,L-100,"Doe, Jane",Data Engineer,999999999,Admin Staff,444444444,"Smith, Fancy",555555555,ITS,True
-987654321,L-101,"Doe, Jane",DevOps Engineer,999999991,Admin Staff,444444444,"Smith, Fancy",555555555,BAD_ACRO,True
\ No newline at end of file
+987654321,L-101x,"Doe, Jane",DevOps Engineer,999999991,Admin Staff,444444444,"Smith, Fancy",555555555,BAD_ACRO,False
\ No newline at end of file
diff --git a/tests/tasks/test_libhr_employee_appointments.py b/tests/tasks/test_libhr_employee_appointments.py
index d3d16bf..d441fc3 100644
--- a/tests/tasks/test_libhr_employee_appointments.py
+++ b/tests/tasks/test_libhr_employee_appointments.py
@@ -35,5 +35,5 @@ def test_transform_libhr_employee_appointments_merge_field_key_values(
     assert list(new_df["Key"]) == [
         "81cf06bfd65aa1f7019750c57a79be99",
         "6e07102ee39ec1f22c63231d090bd4dd",
-        "744aefdd46c40523d60cf69490d81655",
+        "af08a24eeb35fae63fa76e755537b949",
     ]