Skip to content

Commit

Permalink
WIP: partial updates, waiting for test CSV
Browse files Browse the repository at this point in the history
  • Loading branch information
ghukill committed Nov 25, 2024
1 parent 98a72a6 commit 6644b75
Show file tree
Hide file tree
Showing 3 changed files with 69 additions and 15 deletions.
56 changes: 47 additions & 9 deletions hrqb/tasks/libhr_employee_appointments.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,23 @@
"""hrqb.tasks.libhr_employee_appointments"""
"""hrqb.tasks.libhr_employee_appointments
LibHR data is data known only to Library HR staff, and cannot be found in the data
warehouse. Other than some bulk inserts and updates performed by this task, this data
is mostly managed directly in Quickbase by Library HR staff. The data in the table
"LibHR Employee Appointments" is used to augment other Quickbase tables, and drives
some reports of its own.
"""

import datetime
import re

import luigi # type: ignore[import-untyped]
import numpy as np
import pandas as pd

from hrqb.base.task import (
PandasPickleTask,
QuickbaseUpsertTask,
)
from hrqb.utils import md5_hash_from_values
from hrqb.utils import convert_dataframe_columns_to_dates, md5_hash_from_values
from hrqb.utils.quickbase import QBClient


Expand All @@ -18,21 +27,48 @@ class ExtractLibHREmployeeAppointments(PandasPickleTask):
This task is expecting the CSV to be a local filepath. Unlike other pipelines in this
client, this pipeline is rarely run, and is suitable for local, developer runs to load
data.
Expected schema of CSV file:
- MIT ID: MIT ID
- HC ID: pattern of "L-###"
- Full Name (optional)
- Position (optional)
- Position ID: int
- Employee Type (optional)
- Supervisor ID: MIT ID
- Supervisor Name (optional)
- Cost Object
- Department: acronym
- Begin Date: begin date when Headcount ID (HC ID) applied
- End Date: end date when Headcount ID (HC ID) applied
"""

pipeline = luigi.Parameter()
stage = luigi.Parameter("Extract")
csv_filepath = luigi.Parameter()

def get_dataframe(self) -> pd.DataFrame:
# read CSV file
libhr_df = pd.read_csv(self.csv_filepath)

# convert 'Active' column to Quickbase Yes/No checkbox value
# np.False_ and np.True_ values are the result of Excel --> CSV --> pandas
libhr_df["Active"] = libhr_df["Active"].replace(
{np.True_: "Yes", np.False_: "No"}
# convert Begin and End dates and set "Active" column
libhr_df = convert_dataframe_columns_to_dates(
libhr_df, columns=["Begin Date", "End Date"]
)
libhr_df["Active"] = libhr_df["End Date"].apply(
lambda end_date: (
"Yes" if end_date >= datetime.datetime.now(tz=datetime.UTC) else "No"
)
)

# strip any suffix from the L-### headcount id, raising an exception if the headcount id itself is missing
def remove_headcount_id_suffixes(original_headcount_id: str | None) -> str | None:
if original_headcount_id is None:
message = "LibHR CSV data is missing a Headcount ID for one or more rows."
raise ValueError(message)
matched_object = re.match(r"L-\d\d\d", original_headcount_id)
return matched_object.group(0) if matched_object else None

libhr_df["HC ID"] = libhr_df["HC ID"].apply(remove_headcount_id_suffixes)

return libhr_df

Expand Down Expand Up @@ -98,6 +134,8 @@ def get_dataframe(self) -> pd.DataFrame:
"Related Department ID": "Related Department ID",
"Active": "Active",
"Key": "Key",
"Begin Date": "Begin Date",
"End Date": "End Date",
}
return libhr_df[fields.keys()].rename(columns=fields)

Expand All @@ -109,7 +147,7 @@ class LoadLibHREmployeeAppointments(QuickbaseUpsertTask):

@property
def merge_field(self) -> str | None:
"""Explicitly merge on unique Position ID field."""
"""Explicitly merge on unique Key field."""
return "Key"

def requires(self) -> list[luigi.Task]: # pragma: nocover
Expand Down
10 changes: 6 additions & 4 deletions tests/fixtures/libhr_static_data.csv
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
MIT ID,HC ID,Full Name,Position,Position ID,Employee Type,Supervisor ID,Supervisor Name,Cost Object,Department,Active
123456789,L-001,"Doe, John",Science Librarian,888888888,Admin Staff,444444444,"Smith, Fancy",555555555,DDC,True
987654321,L-100,"Doe, Jane",Data Engineer,999999999,Admin Staff,444444444,"Smith, Fancy",555555555,ITS,True
987654321,L-101x,"Doe, Jane",DevOps Engineer,999999991,Admin Staff,444444444,"Smith, Fancy",555555555,BAD_ACRO,False
MIT ID,HC ID,Full Name,Position,Position ID,Employee Type,Supervisor ID,Supervisor Name,Cost Object,Department,Begin Date,End Date,Development Notes
123456789,L-001,"Doe, John",Science Librarian,888888888,Admin Staff,444444444,"Smith, Fancy",555555555,DDC,2022-02-02,2999-12-31,
987654321,L-100,"Doe, Jane",Data Engineer,999999999,Admin Staff,444444444,"Smith, Fancy",555555555,ITS,2021-01-01,2999-12-31,Represents employee Jane Doe getting a new headcount id when changing positions
987654321,L-101x,"Doe, Jane",DevOps Engineer,999999991,Admin Staff,444444444,"Smith, Fancy",555555555,BAD_ACRO,2020-01-01,2020-12-31,"Represents a headcount id that ended, and was picked up by a new employee."
987654320,L-101,"Doe, Alice",DevOps Engineer,999999991,Admin Staff,444444444,"Smith, Fancy",555555555,ITS,2021-06-01,2999-12-31,"Represents a headcount id that was picked up and is still active, but headcount id L-101 was vacant for six months."
343434343,L-050x,"Doe, Stan",Music Librarian,777777777,,333333333,"Lee, Pauline",444444444,IDLA,2023-07-15,2024-02-01,Represents a still vacant headcount id.
18 changes: 16 additions & 2 deletions tests/tasks/test_libhr_employee_appointments.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# ruff: noqa: PD901, PLR2004
import datetime

import pandas as pd

Expand All @@ -8,9 +9,20 @@ def test_extract_libhr_employee_appointments_read_csv(
):
df = task_extract_libhr_employee_appointments.get_dataframe()
assert isinstance(df, pd.DataFrame)
assert len(df) == 3
assert len(df) == 5
assert df.iloc[0]["MIT ID"] == 123456789
assert df.iloc[0]["Supervisor ID"] == 444444444
assert df.iloc[0]["Begin Date"] == datetime.datetime(2022, 2, 2, tzinfo=datetime.UTC)
assert df.iloc[0]["End Date"] == datetime.datetime(2999, 12, 31, tzinfo=datetime.UTC)
assert df.iloc[0]["Active"]


def test_extract_libhr_employee_appointments_strips_headcount_id_suffix(
    task_extract_libhr_employee_appointments,
):
    """The extract task drops the trailing suffix from an "L-###" headcount id."""
    extracted_df = task_extract_libhr_employee_appointments.get_dataframe()
    # the third fixture row carries "L-101x" in the original CSV
    assert extracted_df.iloc[2]["HC ID"] == "L-101"


def test_transform_libhr_employee_appointments_merge_departments(
Expand All @@ -35,5 +47,7 @@ def test_transform_libhr_employee_appointments_merge_field_key_values(
assert list(new_df["Key"]) == [
"81cf06bfd65aa1f7019750c57a79be99",
"6e07102ee39ec1f22c63231d090bd4dd",
"af08a24eeb35fae63fa76e755537b949",
"744aefdd46c40523d60cf69490d81655",
"4af9e7cd8c25cf9929222158530353eb",
"8c5f38a3079a4b9a46d6c33d0fcbd25f",
]

0 comments on commit 6644b75

Please sign in to comment.