WIP: partial updates, waiting for test CSV

MITLibraries · Nov 25, 2024 · 6644b75 · 6644b75
1 parent 98a72a6
commit 6644b75
Show file tree

Hide file tree

Showing 3 changed files with 69 additions and 15 deletions.
diff --git a/hrqb/tasks/libhr_employee_appointments.py b/hrqb/tasks/libhr_employee_appointments.py
@@ -1,14 +1,23 @@
-"""hrqb.tasks.libhr_employee_appointments"""
+"""hrqb.tasks.libhr_employee_appointments
+
+LibHR data is data known only to Library HR staff, and cannot be found in the data
+warehouse.  Other than some bulk inserts and updates performed by this task, this data
+is mostly managed directly in Quickbase by Library HR staff.  The data in the table
+"LibHR Employee Appointments" is used to augment other Quickbase tables, and drives
+some reports of its own.
+"""
+
+import datetime
+import re
 
 import luigi  # type: ignore[import-untyped]
-import numpy as np
 import pandas as pd
 
 from hrqb.base.task import (
     PandasPickleTask,
     QuickbaseUpsertTask,
 )
-from hrqb.utils import md5_hash_from_values
+from hrqb.utils import convert_dataframe_columns_to_dates, md5_hash_from_values
 from hrqb.utils.quickbase import QBClient
 
 
@@ -18,21 +27,48 @@ class ExtractLibHREmployeeAppointments(PandasPickleTask):
     This task is expecting the CSV to be a local filepath.  Unlike other pipelines in this
     client, this pipeline is rarely run, and is suitable for local, developer runs to load
     data.
+
+    Expected schema of CSV file:
+        - MIT ID: MIT ID
+        - HC ID: pattern of "L-###"; a trailing suffix (e.g. "L-101x") is stripped
+        - Full Name (optional)
+        - Position (optional)
+        - Position ID: int
+        - Employee Type (optional)
+        - Supervisor ID: MIT ID
+        - Supervisor Name (optional)
+        - Cost Object
+        - Department: acronym
+        - Begin Date: begin date when Headcount ID (HC ID) applied
+        - End Date: end date when Headcount ID (HC ID) applied
     """
 
     pipeline = luigi.Parameter()
     stage = luigi.Parameter("Extract")
     csv_filepath = luigi.Parameter()
 
     def get_dataframe(self) -> pd.DataFrame:
-        # read CSV file
+        """Read the LibHR CSV, parse dates, derive "Active", and normalize "HC ID".
+
+        Raises:
+            ValueError: if any row has no Headcount ID ("HC ID") value.
+        """
         libhr_df = pd.read_csv(self.csv_filepath)
 
-        # convert 'Active' column to Quickbase Yes/No checkbox value
-        # np.False_ and np.True_ values are the result of Excel --> CSV --> pandas
-        libhr_df["Active"] = libhr_df["Active"].replace(
-            {np.True_: "Yes", np.False_: "No"}
+        # convert Begin and End dates and set "Active" column
+        libhr_df = convert_dataframe_columns_to_dates(
+            libhr_df, columns=["Begin Date", "End Date"]
         )
+        # "Active" is "Yes" while the End Date has not passed; take "now" once so that
+        # every row is compared against the same instant
+        now = datetime.datetime.now(tz=datetime.UTC)
+        libhr_df["Active"] = libhr_df["End Date"].apply(
+            lambda end_date: "Yes" if end_date >= now else "No"
+        )
+
+        # remove any suffixes from L-### headcount id, raising if the value is missing
+        def remove_headcount_id_suffixes(original_headcount_id: str | None) -> str | None:
+            # pandas reads empty CSV cells as NaN rather than None, so use pd.isna()
+            # to catch both before re.match() is handed a non-string
+            if pd.isna(original_headcount_id):
+                message = "LibHR CSV data is missing a Headcount ID for one or more rows."
+                raise ValueError(message)
+            # a present value that does not start with L-### yields None (no exception)
+            matched_object = re.match(r"L-\d\d\d", original_headcount_id)
+            return matched_object.group(0) if matched_object else None
+
+        libhr_df["HC ID"] = libhr_df["HC ID"].apply(remove_headcount_id_suffixes)
 
         return libhr_df
 
@@ -98,6 +134,8 @@ def get_dataframe(self) -> pd.DataFrame:
             "Related Department ID": "Related Department ID",
             "Active": "Active",
             "Key": "Key",
+            "Begin Date": "Begin Date",
+            "End Date": "End Date",
         }
         return libhr_df[fields.keys()].rename(columns=fields)
 
@@ -109,7 +147,7 @@ class LoadLibHREmployeeAppointments(QuickbaseUpsertTask):
 
     @property
     def merge_field(self) -> str | None:
-        """Explicitly merge on unique Position ID field."""
+        """Explicitly merge on the unique "Key" field; returns that field's name."""
         return "Key"
 
     def requires(self) -> list[luigi.Task]:  # pragma: nocover

diff --git a/tests/fixtures/libhr_static_data.csv b/tests/fixtures/libhr_static_data.csv
@@ -1,4 +1,6 @@
-MIT ID,HC ID,Full Name,Position,Position ID,Employee Type,Supervisor ID,Supervisor Name,Cost Object,Department,Active
-123456789,L-001,"Doe, John",Science Librarian,888888888,Admin Staff,444444444,"Smith, Fancy",555555555,DDC,True
-987654321,L-100,"Doe, Jane",Data Engineer,999999999,Admin Staff,444444444,"Smith, Fancy",555555555,ITS,True
-987654321,L-101x,"Doe, Jane",DevOps Engineer,999999991,Admin Staff,444444444,"Smith, Fancy",555555555,BAD_ACRO,False
+MIT ID,HC ID,Full Name,Position,Position ID,Employee Type,Supervisor ID,Supervisor Name,Cost Object,Department,Begin Date,End Date,Development Notes
+123456789,L-001,"Doe, John",Science Librarian,888888888,Admin Staff,444444444,"Smith, Fancy",555555555,DDC,2022-02-02,2999-12-31,
+987654321,L-100,"Doe, Jane",Data Engineer,999999999,Admin Staff,444444444,"Smith, Fancy",555555555,ITS,2021-01-01,2999-12-31,Represents employee Jane Doe getting a new headcount id when changing positions
+987654321,L-101x,"Doe, Jane",DevOps Engineer,999999991,Admin Staff,444444444,"Smith, Fancy",555555555,BAD_ACRO,2020-01-01,2020-12-31,"Represents a headcount id that  ended, and was picked up by a new employee."
+987654320,L-101,"Doe, Alice",DevOps Engineer,999999991,Admin Staff,444444444,"Smith, Fancy",555555555,ITS,2021-06-01,2999-12-31,"Represents a headcount id that was picked up and is still active, but headcount id L-101 was vacant for six months."
+343434343,L-050x,"Doe, Stan",Music Librarian,777777777,,333333333,"Lee, Pauline",444444444,IDLA,2023-07-15,2024-02-01,Represents a still vacant headcount id.
diff --git a/tests/tasks/test_libhr_employee_appointments.py b/tests/tasks/test_libhr_employee_appointments.py
@@ -1,4 +1,5 @@
 # ruff: noqa: PD901, PLR2004
+import datetime
 
 import pandas as pd
 
@@ -8,9 +9,20 @@ def test_extract_libhr_employee_appointments_read_csv(
 ):
     df = task_extract_libhr_employee_appointments.get_dataframe()
     assert isinstance(df, pd.DataFrame)
-    assert len(df) == 3
+    assert len(df) == 5
     assert df.iloc[0]["MIT ID"] == 123456789
     assert df.iloc[0]["Supervisor ID"] == 444444444
+    assert df.iloc[0]["Begin Date"] == datetime.datetime(2022, 2, 2, tzinfo=datetime.UTC)
+    assert df.iloc[0]["End Date"] == datetime.datetime(2999, 12, 31, tzinfo=datetime.UTC)
+    # "Active" holds the strings "Yes"/"No", which are both truthy, so compare values:
+    # row 0 ends in 2999 (still active), row 2 ended in 2020 (no longer active)
+    assert df.iloc[0]["Active"] == "Yes"
+    assert df.iloc[2]["Active"] == "No"
+
+
+def test_extract_libhr_employee_appointments_strips_headcount_id_suffix(
+    task_extract_libhr_employee_appointments,
+):
+    """Third fixture row has "L-101x" in the CSV; extract keeps only "L-101"."""
+    extracted_df = task_extract_libhr_employee_appointments.get_dataframe()
+    assert extracted_df.iloc[2]["HC ID"] == "L-101"
 
 
 def test_transform_libhr_employee_appointments_merge_departments(
@@ -35,5 +47,7 @@ def test_transform_libhr_employee_appointments_merge_field_key_values(
     assert list(new_df["Key"]) == [
         "81cf06bfd65aa1f7019750c57a79be99",
         "6e07102ee39ec1f22c63231d090bd4dd",
-        "af08a24eeb35fae63fa76e755537b949",
+        "744aefdd46c40523d60cf69490d81655",
+        "4af9e7cd8c25cf9929222158530353eb",
+        "8c5f38a3079a4b9a46d6c33d0fcbd25f",
     ]