diff --git a/hrqb/tasks/employee_appointments.py b/hrqb/tasks/employee_appointments.py index 664c86d..982b636 100644 --- a/hrqb/tasks/employee_appointments.py +++ b/hrqb/tasks/employee_appointments.py @@ -57,6 +57,9 @@ def get_dataframe(self) -> pd.DataFrame: libhr_df = self.named_inputs["ExtractQBLibHREmployeeAppointments"].read() depts_df = self.named_inputs["ExtractQBDepartments"].read() + # filter libhr data to active appointments, with position IDs + libhr_df = libhr_df[(libhr_df["Active"]) & ~(libhr_df["Position ID"].isna())] + # normalize position id to string and pad zeros libhr_df["Position ID"] = libhr_df["Position ID"].apply( lambda x: str(int(x)).zfill(8) diff --git a/hrqb/tasks/libhr_employee_appointments.py b/hrqb/tasks/libhr_employee_appointments.py index b77c34e..3796431 100644 --- a/hrqb/tasks/libhr_employee_appointments.py +++ b/hrqb/tasks/libhr_employee_appointments.py @@ -1,12 +1,14 @@ """hrqb.tasks.libhr_employee_appointments""" import luigi # type: ignore[import-untyped] +import numpy as np import pandas as pd from hrqb.base.task import ( PandasPickleTask, QuickbaseUpsertTask, ) +from hrqb.utils import md5_hash_from_values from hrqb.utils.quickbase import QBClient @@ -23,7 +25,16 @@ class ExtractLibHREmployeeAppointments(PandasPickleTask): csv_filepath = luigi.Parameter() def get_dataframe(self) -> pd.DataFrame: - return pd.read_csv(self.csv_filepath) + # read CSV file + libhr_df = pd.read_csv(self.csv_filepath) + + # convert 'Active' column to Quickbase Yes/No checkbox value + # np.False_ and np.True_ values are the result of Excel --> CSV --> pandas + libhr_df["Active"] = libhr_df["Active"].replace( + {np.True_: "Yes", np.False_: "No"} + ) + + return libhr_df class ExtractQBDepartments(PandasPickleTask): @@ -67,6 +78,17 @@ def get_dataframe(self) -> pd.DataFrame: how="left", ) + # mint a unique, deterministic value for the merge "Key" field + libhr_df["Key"] = libhr_df.apply( + lambda row: md5_hash_from_values( + [ + str(row["MIT ID"]), + str(row["HC ID"]), + ] + ), + axis=1, + ) + fields = { "MIT ID": "Related Employee MIT ID", "Supervisor ID": "Related Supervisor MIT ID", @@ -74,6 +96,8 @@ def get_dataframe(self) -> pd.DataFrame: "HC ID": "HC ID", "Position ID": "Position ID", "Related Department ID": "Related Department ID", + "Active": "Active", + "Key": "Key", } return libhr_df[fields.keys()].rename(columns=fields) @@ -86,7 +110,7 @@ class LoadLibHREmployeeAppointments(QuickbaseUpsertTask): @property def merge_field(self) -> str | None: """Explicitly merge on unique Position ID field.""" - return "Position ID" + return "Key" def requires(self) -> list[luigi.Task]: # pragma: nocover return [ diff --git a/tests/conftest.py b/tests/conftest.py index ae1a0a3..542dca2 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -653,6 +653,7 @@ def task_extract_qb_libhr_complete(all_tasks_pipeline_name): "Position ID": 987654321, "Cost Object": 7777777, "Related Department ID": 42.0, + "Active": True, } ] ) diff --git a/tests/fixtures/libhr_static_data.csv b/tests/fixtures/libhr_static_data.csv index 0f0556c..a8be8cc 100644 --- a/tests/fixtures/libhr_static_data.csv +++ b/tests/fixtures/libhr_static_data.csv @@ -1,4 +1,4 @@ -MIT ID,HC ID,Full Name,Position,Position ID,Employee Type,Supervisor ID,Supervisor Name,Cost Object,Department -123456789,L-001,"Doe, John",Science Librarian,888888888,Admin Staff,444444444,"Smith, Fancy",555555555,DDC -987654321,L-100,"Doe, Jane",Data Engineer,999999999,Admin Staff,444444444,"Smith, Fancy",555555555,ITS -987654321,L-100,"Doe, Jane",DevOps Engineer,999999991,Admin Staff,444444444,"Smith, Fancy",555555555,BAD_ACRO \ No newline at end of file +MIT ID,HC ID,Full Name,Position,Position ID,Employee Type,Supervisor ID,Supervisor Name,Cost Object,Department,Active +123456789,L-001,"Doe, John",Science Librarian,888888888,Admin Staff,444444444,"Smith, Fancy",555555555,DDC,True +987654321,L-100,"Doe, Jane",Data Engineer,999999999,Admin Staff,444444444,"Smith, Fancy",555555555,ITS,True +987654321,L-101x,"Doe, Jane",DevOps Engineer,999999991,Admin Staff,444444444,"Smith, Fancy",555555555,BAD_ACRO,False \ No newline at end of file diff --git a/tests/tasks/test_libhr_employee_appointments.py b/tests/tasks/test_libhr_employee_appointments.py index f763eed..d441fc3 100644 --- a/tests/tasks/test_libhr_employee_appointments.py +++ b/tests/tasks/test_libhr_employee_appointments.py @@ -1,6 +1,5 @@ # ruff: noqa: PD901, PLR2004 -import numpy as np import pandas as pd @@ -18,39 +17,23 @@ def test_transform_libhr_employee_appointments_merge_departments( task_transform_libhr_employee_appointments, ): new_df = task_transform_libhr_employee_appointments.get_dataframe() - assert new_df.equals( - pd.DataFrame( - [ - { - "Related Employee MIT ID": 123456789, - "Related Supervisor MIT ID": 444444444, - "Cost Object": 555555555, - "HC ID": "L-001", - "Position ID": 888888888, - "Related Department ID": 35.0, - }, - { - "Related Employee MIT ID": 987654321, - "Related Supervisor MIT ID": 444444444, - "Cost Object": 555555555, - "HC ID": "L-100", - "Position ID": 999999999, - "Related Department ID": 40.0, - }, - { - "Related Employee MIT ID": 987654321, - "Related Supervisor MIT ID": 444444444, - "Cost Object": 555555555, - "HC ID": "L-100", - "Position ID": 999999991, - "Related Department ID": np.nan, - }, - ] - ) - ) + assert new_df.iloc[0]["Related Department ID"] == 35.0 + assert new_df.iloc[1]["Related Department ID"] == 40.0 + assert pd.isna(new_df.iloc[2]["Related Department ID"]) # no match in merge, so NULL def test_load_libhr_employee_appointments_merge_field_set( task_load_libhr_employee_appointments, ): - assert task_load_libhr_employee_appointments.merge_field == "Position ID" + assert task_load_libhr_employee_appointments.merge_field == "Key" + + +def test_transform_libhr_employee_appointments_merge_field_key_values( + task_transform_libhr_employee_appointments, +): + new_df = task_transform_libhr_employee_appointments.get_dataframe() + assert list(new_df["Key"]) == [ + "81cf06bfd65aa1f7019750c57a79be99", + "6e07102ee39ec1f22c63231d090bd4dd", + "af08a24eeb35fae63fa76e755537b949", + ]