From f8cbabb7d04224d274af07f5238fc92d6a88cc51 Mon Sep 17 00:00:00 2001 From: Graham Hukill Date: Tue, 17 Sep 2024 16:11:43 -0400 Subject: [PATCH 1/2] Update LibHR transform and load Why these changes are being introduced: A new requirement was added that table 'LibHR Employee Appointments' could have multiple rows for the same employee, but different headcount IDs (HC ID). This would represent previous positions they held. Related, this Quickbase table would need to support rows for old headcount IDs for employees, requiring a way to distinguish 'active' from 'inactive' rows. How this addresses that need: * Add new 'Active' column to Quickbase table 'LibHR Employee Appointments' * Add new 'Key' field to same table that is used for merge updates * this field is MD5 of MIT ID + Headcount ID (HC ID) Side effects of this change: * None Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/HRQB-49 --- hrqb/tasks/libhr_employee_appointments.py | 28 ++++++++++- tests/fixtures/libhr_static_data.csv | 8 ++-- .../tasks/test_libhr_employee_appointments.py | 47 ++++++------------- 3 files changed, 45 insertions(+), 38 deletions(-) diff --git a/hrqb/tasks/libhr_employee_appointments.py b/hrqb/tasks/libhr_employee_appointments.py index b77c34e..3796431 100644 --- a/hrqb/tasks/libhr_employee_appointments.py +++ b/hrqb/tasks/libhr_employee_appointments.py @@ -1,12 +1,14 @@ """hrqb.tasks.libhr_employee_appointments""" import luigi # type: ignore[import-untyped] +import numpy as np import pandas as pd from hrqb.base.task import ( PandasPickleTask, QuickbaseUpsertTask, ) +from hrqb.utils import md5_hash_from_values from hrqb.utils.quickbase import QBClient @@ -23,7 +25,16 @@ class ExtractLibHREmployeeAppointments(PandasPickleTask): csv_filepath = luigi.Parameter() def get_dataframe(self) -> pd.DataFrame: - return pd.read_csv(self.csv_filepath) + # read CSV file + libhr_df = pd.read_csv(self.csv_filepath) + + # convert 'Active' column to Quickbase Yes/No checkbox value + # 
np.False_ and np.True_ values are the result of Excel --> CSV --> pandas + libhr_df["Active"] = libhr_df["Active"].replace( + {np.True_: "Yes", np.False_: "No"} + ) + + return libhr_df class ExtractQBDepartments(PandasPickleTask): @@ -67,6 +78,17 @@ def get_dataframe(self) -> pd.DataFrame: how="left", ) + # mint a unique, deterministic value for the merge "Key" field + libhr_df["Key"] = libhr_df.apply( + lambda row: md5_hash_from_values( + [ + str(row["MIT ID"]), + str(row["HC ID"]), + ] + ), + axis=1, + ) + fields = { "MIT ID": "Related Employee MIT ID", "Supervisor ID": "Related Supervisor MIT ID", @@ -74,6 +96,8 @@ def get_dataframe(self) -> pd.DataFrame: "HC ID": "HC ID", "Position ID": "Position ID", "Related Department ID": "Related Department ID", + "Active": "Active", + "Key": "Key", } return libhr_df[fields.keys()].rename(columns=fields) @@ -86,7 +110,7 @@ class LoadLibHREmployeeAppointments(QuickbaseUpsertTask): @property def merge_field(self) -> str | None: """Explicitly merge on unique Position ID field.""" - return "Position ID" + return "Key" def requires(self) -> list[luigi.Task]: # pragma: nocover return [ diff --git a/tests/fixtures/libhr_static_data.csv b/tests/fixtures/libhr_static_data.csv index 0f0556c..7e73448 100644 --- a/tests/fixtures/libhr_static_data.csv +++ b/tests/fixtures/libhr_static_data.csv @@ -1,4 +1,4 @@ -MIT ID,HC ID,Full Name,Position,Position ID,Employee Type,Supervisor ID,Supervisor Name,Cost Object,Department -123456789,L-001,"Doe, John",Science Librarian,888888888,Admin Staff,444444444,"Smith, Fancy",555555555,DDC -987654321,L-100,"Doe, Jane",Data Engineer,999999999,Admin Staff,444444444,"Smith, Fancy",555555555,ITS -987654321,L-100,"Doe, Jane",DevOps Engineer,999999991,Admin Staff,444444444,"Smith, Fancy",555555555,BAD_ACRO \ No newline at end of file +MIT ID,HC ID,Full Name,Position,Position ID,Employee Type,Supervisor ID,Supervisor Name,Cost Object,Department,Active +123456789,L-001,"Doe, John",Science 
Librarian,888888888,Admin Staff,444444444,"Smith, Fancy",555555555,DDC,True +987654321,L-100,"Doe, Jane",Data Engineer,999999999,Admin Staff,444444444,"Smith, Fancy",555555555,ITS,True +987654321,L-101,"Doe, Jane",DevOps Engineer,999999991,Admin Staff,444444444,"Smith, Fancy",555555555,BAD_ACRO,True \ No newline at end of file diff --git a/tests/tasks/test_libhr_employee_appointments.py b/tests/tasks/test_libhr_employee_appointments.py index f763eed..d3d16bf 100644 --- a/tests/tasks/test_libhr_employee_appointments.py +++ b/tests/tasks/test_libhr_employee_appointments.py @@ -1,6 +1,5 @@ # ruff: noqa: PD901, PLR2004 -import numpy as np import pandas as pd @@ -18,39 +17,23 @@ def test_transform_libhr_employee_appointments_merge_departments( task_transform_libhr_employee_appointments, ): new_df = task_transform_libhr_employee_appointments.get_dataframe() - assert new_df.equals( - pd.DataFrame( - [ - { - "Related Employee MIT ID": 123456789, - "Related Supervisor MIT ID": 444444444, - "Cost Object": 555555555, - "HC ID": "L-001", - "Position ID": 888888888, - "Related Department ID": 35.0, - }, - { - "Related Employee MIT ID": 987654321, - "Related Supervisor MIT ID": 444444444, - "Cost Object": 555555555, - "HC ID": "L-100", - "Position ID": 999999999, - "Related Department ID": 40.0, - }, - { - "Related Employee MIT ID": 987654321, - "Related Supervisor MIT ID": 444444444, - "Cost Object": 555555555, - "HC ID": "L-100", - "Position ID": 999999991, - "Related Department ID": np.nan, - }, - ] - ) - ) + assert new_df.iloc[0]["Related Department ID"] == 35.0 + assert new_df.iloc[1]["Related Department ID"] == 40.0 + assert pd.isna(new_df.iloc[2]["Related Department ID"]) # no match in merge, so NULL def test_load_libhr_employee_appointments_merge_field_set( task_load_libhr_employee_appointments, ): - assert task_load_libhr_employee_appointments.merge_field == "Position ID" + assert task_load_libhr_employee_appointments.merge_field == "Key" + + +def 
test_transform_libhr_employee_appointments_merge_field_key_values( + task_transform_libhr_employee_appointments, +): + new_df = task_transform_libhr_employee_appointments.get_dataframe() + assert list(new_df["Key"]) == [ + "81cf06bfd65aa1f7019750c57a79be99", + "6e07102ee39ec1f22c63231d090bd4dd", + "744aefdd46c40523d60cf69490d81655", + ] From 6364f993104eb71cbcff1953526cfd9737af2ceb Mon Sep 17 00:00:00 2001 From: Graham Hukill Date: Wed, 18 Sep 2024 10:01:58 -0400 Subject: [PATCH 2/2] Employee Appointments join on active LibHR Data Why these changes are being introduced: Now that the Quickbase table 'LibHR Employee Appointments' has historical data (headcount ids for previous employee appointments), it is important that Employee Appointments only join on 'active' LibHR data. How this addresses that need: * For task TransformEmployeeAppointments, only join on active LibHR Employee Appointment rows * Update fixture to simulate inactive/historical data in LibHR data Side effects of this change: * None Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/HRQB-49 --- hrqb/tasks/employee_appointments.py | 3 +++ tests/conftest.py | 1 + tests/fixtures/libhr_static_data.csv | 2 +- tests/tasks/test_libhr_employee_appointments.py | 2 +- 4 files changed, 6 insertions(+), 2 deletions(-) diff --git a/hrqb/tasks/employee_appointments.py b/hrqb/tasks/employee_appointments.py index 664c86d..982b636 100644 --- a/hrqb/tasks/employee_appointments.py +++ b/hrqb/tasks/employee_appointments.py @@ -57,6 +57,9 @@ def get_dataframe(self) -> pd.DataFrame: libhr_df = self.named_inputs["ExtractQBLibHREmployeeAppointments"].read() depts_df = self.named_inputs["ExtractQBDepartments"].read() + # filter libhr data to active appointments, with position IDs + libhr_df = libhr_df[(libhr_df["Active"]) & ~(libhr_df["Position ID"].isna())] + # normalize position id to string and pad zeros libhr_df["Position ID"] = libhr_df["Position ID"].apply( lambda x: str(int(x)).zfill(8) diff --git 
a/tests/conftest.py b/tests/conftest.py index ae1a0a3..542dca2 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -653,6 +653,7 @@ def task_extract_qb_libhr_complete(all_tasks_pipeline_name): "Position ID": 987654321, "Cost Object": 7777777, "Related Department ID": 42.0, + "Active": True, } ] ) diff --git a/tests/fixtures/libhr_static_data.csv b/tests/fixtures/libhr_static_data.csv index 7e73448..a8be8cc 100644 --- a/tests/fixtures/libhr_static_data.csv +++ b/tests/fixtures/libhr_static_data.csv @@ -1,4 +1,4 @@ MIT ID,HC ID,Full Name,Position,Position ID,Employee Type,Supervisor ID,Supervisor Name,Cost Object,Department,Active 123456789,L-001,"Doe, John",Science Librarian,888888888,Admin Staff,444444444,"Smith, Fancy",555555555,DDC,True 987654321,L-100,"Doe, Jane",Data Engineer,999999999,Admin Staff,444444444,"Smith, Fancy",555555555,ITS,True -987654321,L-101,"Doe, Jane",DevOps Engineer,999999991,Admin Staff,444444444,"Smith, Fancy",555555555,BAD_ACRO,True \ No newline at end of file +987654321,L-101x,"Doe, Jane",DevOps Engineer,999999991,Admin Staff,444444444,"Smith, Fancy",555555555,BAD_ACRO,False \ No newline at end of file diff --git a/tests/tasks/test_libhr_employee_appointments.py b/tests/tasks/test_libhr_employee_appointments.py index d3d16bf..d441fc3 100644 --- a/tests/tasks/test_libhr_employee_appointments.py +++ b/tests/tasks/test_libhr_employee_appointments.py @@ -35,5 +35,5 @@ def test_transform_libhr_employee_appointments_merge_field_key_values( assert list(new_df["Key"]) == [ "81cf06bfd65aa1f7019750c57a79be99", "6e07102ee39ec1f22c63231d090bd4dd", - "744aefdd46c40523d60cf69490d81655", + "af08a24eeb35fae63fa76e755537b949", ]