From dc77521fb71b0024f9b7d8db00c5f3e509f3992d Mon Sep 17 00:00:00 2001 From: Ian Date: Mon, 15 Apr 2024 16:22:12 -0400 Subject: [PATCH] Add documentation on everything --- Dockerfile | 59 +++-- IIT-Prediction/version.md | 197 +++++++------- README.md | 7 +- SQL/iit_prod_data_extract.sql | 248 +++++++++--------- docker-resources/config.example.yml | 2 + docker-resources/crontab | 2 + docker-resources/docker-entrypoint.sh | 1 + docker-resources/plumber.R | 44 +++- .../run_daily_stored_procedures.sh | 2 + docker-resources/run_predictions.sh | 2 + 10 files changed, 321 insertions(+), 243 deletions(-) diff --git a/Dockerfile b/Dockerfile index 4600b7a..cc1317f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,48 +2,71 @@ FROM rstudio/plumber:latest ENV TZ "Africa/Nairobi" +# install base libraries we need RUN apt-get -y update -qq && apt-get -y --no-install-recommends install \ - tini \ - libmariadb-dev \ - libmysqlclient21 \ - openjdk-8-jdk-headless \ - cron \ - curl \ - && rm -rf /var/lib/apt/lists/* \ - && rm -rf /etc/cron.*/* + tini \ + libmariadb-dev \ + libmysqlclient21 \ + openjdk-8-jdk-headless \ + cron \ + curl \ + && rm -rf /var/lib/apt/lists/* \ + && rm -rf /etc/cron.*/* +# install the R packages we need +# for these, the latest version should always be usable RUN install2.r --error --skipinstalled \ - tidyverse \ - pool \ - clock \ - config \ - uuid \ - readr \ - RMariaDB \ - DBI \ - && rm -rf /tmp/downloaded_packages + tidyverse \ + pool \ + clock \ + config \ + uuid \ + readr \ + RMariaDB \ + DBI \ + && rm -rf /tmp/downloaded_packages +# The model always needs to run on the exact version of h2o used to train it RUN Rscript -e "remotes::install_version('h2o', '3.42.0.2')" +# Add the prediction model to the app COPY IIT-Prediction/model/V7 /app/model +# Add the production extraction query to the app COPY SQL/iit_prod_data_extract.sql /app/iit_prod_data_extract.sql + +# The next scripts are used for cron jobs +# this script triggers the predictions to run by 
hitting the API endpoint COPY docker-resources/run_predictions.sh /app/run_predictions.sh RUN chmod 0744 /app/run_predictions.sh +# this script runs a small number of stored procedures we depend on to update +# various tables used in the predictions COPY docker-resources/run_daily_stored_procedures.sh /app/run_daily_stored_procedures.sh RUN chmod 0744 /app/run_daily_stored_procedures.sh + +# this is a Docker entrypoint script +# it ensures the cron daemon is started and then runs the API COPY docker-resources/docker-entrypoint.sh /docker-entrypoint.sh RUN chmod 0744 /docker-entrypoint.sh +# here we actually setup the cron jobs, using our source crontab +# cron is _very_ picky, so it may not be best to mess with this COPY docker-resources/crontab /etc/cron.d/iit-crontab RUN chmod 0644 /etc/cron.d/iit-crontab RUN crontab -u root /etc/cron.d/iit-crontab -RUN touch /var/log/cron.log +# now we also need to add the R code used here +# this R code actually runs the stored procedures for run_daily_stored_procedures.sh +# this is done in R so we can re-use the database settings for the API COPY docker-resources/dailyStoredProcedures.R /app/dailyStoredProcedures.R +# plumber.R is the main app COPY docker-resources/plumber.R /app/plumber.R +# EXPOSE is just documentation; by default, the API is run on port 8000 +# In production, this port is not exposed, as we hit the API from inside the container EXPOSE 8000 +# setup the entrypoint ENTRYPOINT ["tini", "--", "/docker-entrypoint.sh"] +# this may not be necessary, but its left in to match the parent container defaults CMD ["/app/plumber.R"] diff --git a/IIT-Prediction/version.md b/IIT-Prediction/version.md index bc1e00f..10713a9 100644 --- a/IIT-Prediction/version.md +++ b/IIT-Prediction/version.md @@ -1,4 +1,3 @@ - # Premise This file is used to keep track of model version metadata @@ -15,22 +14,20 @@ TODO: TODO: - ## V4 TODO: - ## V5 TODO: ## V6 -Version 6 of the model is trained using a larger dataset (All AMPATH care data) 
and more recent dataset (Anyone with an encounter after 2021): +Version 6 of the model is trained using a larger dataset (All AMPATH care data) and more recent dataset (Anyone with an encounter after 2021): -* "Cohort 2021 Patients: 95101" -* "Cohort 2021 Visits: 859184" +- "Cohort 2021 Patients: 95101" +- "Cohort 2021 Visits: 859184" Some predictors have been added while others have been removed. Here is a list of the new predictors used to train the model @@ -39,56 +36,55 @@ Some predictors have been added while others have been removed. Here is a list o HIV_disclosure_stage = if_else(is.na(hiv_disclosure_status_value),"Not Done",hiv_disclosure_status_value), Clinic_County=Clinic_County, Clinic_Name =Clinic_Name, - Program_Name = if_else(is.na(Program_Name),"Unknown",Program_Name), + Program_Name = if_else(is.na(Program_Name),"Unknown",Program_Name), # New Vars TB_screening = tb_screen, - TB_Test_Result =factor(tb_test_result), + TB_Test_Result =factor(tb_test_result), On_TB_TX = on_tb_tx, On_IPT = on_ipt, CA_CX_Screening =if_else(is.na(ca_cx_screening),0,ca_cx_screening), CA_CX_Screening_Result = factor(if_else(is.na(ca_cx_screening_result),1118,ca_cx_screening_result)) ``` - Also here is a list of all predictors: ``` X=c( - - c( 'Age','Age_NA', - 'Gender' , - 'Duration_in_HIV_care', 'Duration_in_HIV_care_NA', + + c( 'Age','Age_NA', + 'Gender' , + 'Duration_in_HIV_care', 'Duration_in_HIV_care_NA', 'BMI', 'BMI_NA', - #'Days_to_Start_of_ART', 'Days_to_Start_of_ART_NA', + #'Days_to_Start_of_ART', 'Days_to_Start_of_ART_NA', 'WHO_staging','WHO_staging_NA', 'Viral_Load_log10', 'Viral_Load_log10_NA', 'VL_suppression', 'Days_Since_Last_VL', - 'HIV_disclosure','HIV_disclosure_NA', - 'Regimen_Line', 'Regimen_Line_NA', + 'HIV_disclosure','HIV_disclosure_NA', + 'Regimen_Line', 'Regimen_Line_NA', 'Pregnancy', 'CD4','CD4_NA', 'Days_Since_Last_CD4', "Encounter_Type_Class", 'ART_regimen', - 'Visit_Number', + 'Visit_Number', 'Days_defaulted_in_prev_enc', 
'Days_defaulted_in_prev_enc_NA', 'num_2wks_defaults_last_3visits', 'num_2wks_defaults_last_3visits_NA', 'ever_defaulted_by_1m_in_last_1year','ever_defaulted_by_1m_in_last_1year_NA', 'ever_defaulted_by_1m_in_last_2year','ever_defaulted_by_1m_in_last_2year_NA', - + # Baseline 'Age_baseline', - 'Gender_baseline' , + 'Gender_baseline' , 'BMI_baseline', 'WHO_staging_baseline', - 'VL_suppression_baseline', + 'VL_suppression_baseline', 'Viral_Load_log10_baseline', 'HIV_disclosure_baseline', - 'Regimen_Line_baseline', + 'Regimen_Line_baseline', 'Pregnancy_baseline', 'CD4_baseline', - "Clinic_Name_baseline", + "Clinic_Name_baseline", 'ART_regimen_baseline', - + # New Vars 'ART_Adherence', 'HIV_disclosure_stage', @@ -96,35 +92,32 @@ X=c( 'Clinic_Name', 'Program_Name', 'TB_screening', - 'TB_Test_Result', + 'TB_Test_Result', 'On_TB_TX', 'On_IPT', 'CA_CX_Screening', 'CA_CX_Screening_Result' - - - + + + ) - - + + ) ``` - ### Model to use? 2_StackedEnsemble_BestOfFamily_1_AutoML_8_20230726_142520_auc_0.704 - ## V7 - Version 7 of the model is trained using 2 cohorts of datasets: -* Adult 88,809 (93.383876%) -* Minor 6,292 (6.616124%) +- Adult 88,809 (93.383876%) +- Minor 6,292 (6.616124%) With these changes, the cross-validated AUC has increased from ~70 to ~77 @@ -141,7 +134,6 @@ Here is a list of the new predictors that have been added 'num_1month_defaults_last_3visits' ``` - Please see the util files on how these variables are define ``` @@ -149,18 +141,17 @@ Please see the util files on how these variables are define num_1day_defaults_last_3visits = as.double(lag(rollapplyr(`disengagement-1day_bin`, 3, sum, partial =TRUE),order_by = Encounter_ID)), num_1day_defaults_last_3visits = if_else(is.na(num_1day_defaults_last_3visits), 0, num_1day_defaults_last_3visits), - + num_7days_defaults_last_3visits = as.double(lag(rollapplyr(`disengagement-7days_bin`, 3, sum, partial =TRUE),order_by = Encounter_ID)), num_7days_defaults_last_3visits = 
if_else(is.na(num_7days_defaults_last_3visits), 0, num_7days_defaults_last_3visits), - + num_1month_defaults_last_3visits = as.double(lag(rollapplyr(`disengagement-1month_bin`, 3, sum, partial =TRUE),order_by = Encounter_ID)), num_1month_defaults_last_3visits = if_else(is.na(num_1month_defaults_last_3visits), 0, num_1month_defaults_last_3visits), - + ``` ### Removed predictors - Here is a list of the old predictors that have been removed ``` @@ -168,8 +159,6 @@ Here is a list of the old predictors that have been removed 'num_2wks_defaults_last_3visits_NA' ``` - - ### All predictors Finally here is a list of all predictors: @@ -178,72 +167,71 @@ Finally here is a list of all predictors: X=c( - - c( 'Age','Age_NA', - 'Gender' , - 'Duration_in_HIV_care', 'Duration_in_HIV_care_NA', + + c( 'Age','Age_NA', + 'Gender' , + 'Duration_in_HIV_care', 'Duration_in_HIV_care_NA', 'BMI', 'BMI_NA', - #'Days_to_Start_of_ART', 'Days_to_Start_of_ART_NA', + #'Days_to_Start_of_ART', 'Days_to_Start_of_ART_NA', 'WHO_staging','WHO_staging_NA', - 'Viral_Load_log10', 'Viral_Load_log10_NA', # REMOVED (V7) 'VL_suppression', + 'Viral_Load_log10', 'Viral_Load_log10_NA', # REMOVED (V7) 'VL_suppression', 'Days_Since_Last_VL', - 'HIV_disclosure','HIV_disclosure_NA', - 'Regimen_Line', 'Regimen_Line_NA', + 'HIV_disclosure','HIV_disclosure_NA', + 'Regimen_Line', 'Regimen_Line_NA', 'Pregnancy', 'CD4','CD4_NA', 'Days_Since_Last_CD4', "Encounter_Type_Class", 'ART_regimen', - 'Visit_Number', + 'Visit_Number', 'Days_defaulted_in_prev_enc', 'Days_defaulted_in_prev_enc_NA', 'num_2wks_defaults_last_3visits', # REMOVED (V7) 'num_2wks_defaults_last_3visits_NA', 'ever_defaulted_by_1m_in_last_1year','ever_defaulted_by_1m_in_last_1year_NA', 'ever_defaulted_by_1m_in_last_2year','ever_defaulted_by_1m_in_last_2year_NA', - + # Baseline 'Age_baseline', - 'Gender_baseline' , + 'Gender_baseline' , 'BMI_baseline', 'WHO_staging_baseline', - 'VL_suppression_baseline', + 'VL_suppression_baseline', 
'Viral_Load_log10_baseline', 'HIV_disclosure_baseline', - 'Regimen_Line_baseline', + 'Regimen_Line_baseline', 'Pregnancy_baseline', 'CD4_baseline', - "Clinic_Name_baseline", + "Clinic_Name_baseline", 'ART_regimen_baseline', - + # New Vars (V6) 'ART_Adherence', 'HIV_disclosure_stage', 'Clinic_County', 'Clinic_Name', - 'Program_Name', + 'Program_Name', 'TB_screening', - 'TB_Test_Result', + 'TB_Test_Result', 'On_TB_TX', 'On_IPT', 'CA_CX_Screening', 'CA_CX_Screening_Result', - + # New Var (V7) 'Month', 'num_1day_defaults_last_3visits', 'num_7days_defaults_last_3visits', 'num_1month_defaults_last_3visits' - - - - + + + + ) - - + + ) ``` - ### Model to use? #### Adult Model @@ -253,11 +241,10 @@ IIT-Prediction/model/V7/y0_1days_adult_IIT/1_StackedEnsemble_BestOfFamily_1_Auto Note: Please remember to factorize all character predictors before scoring ``` -clean.df= clean.long.df %>% - mutate_if(is.character, as.factor) +clean.df= clean.long.df %>% + mutate_if(is.character, as.factor) ``` - ### Minor Model IIT-Prediction/model/V7/y0_1day_minor_IIT/1_StackedEnsemble_BestOfFamily_1_AutoML_2_20230813_03957_auc_0.734 @@ -265,28 +252,53 @@ IIT-Prediction/model/V7/y0_1day_minor_IIT/1_StackedEnsemble_BestOfFamily_1_AutoM Note: Please remember to factorize all character predictors before scoring as shown below ``` -clean.df= clean.long.df %>% - mutate_if(is.character, as.factor) +clean.df= clean.long.df %>% + mutate_if(is.character, as.factor) ``` ### Monitoring Please save logs especially warning logs which we can use to track any drift in concept or bad variables. 
+## V8 -## V9 +### Removed Predictors +``` + 'TB_screening', + 'On_TB_TX', + 'On_IPT', + 'CA_CX_Screening', + 'CA_CX_Screening_Result', + 'num_2wks_defaults_last_3visits', + 'ever_defaulted_by_1m_in_last_1year', + 'ever_defaulted_by_1m_in_last_1year_NA', + 'ever_defaulted_by_1m_in_last_2year', + 'ever_defaulted_by_1m_in_last_2year_NA', + 'Age_baseline', + 'Gender_baseline' , + 'BMI_baseline', + 'WHO_staging_baseline', + 'VL_suppression_baseline', + 'Viral_Load_log10_baseline', + 'HIV_disclosure_baseline', + 'Regimen_Line_baseline', + 'Pregnancy_baseline', + 'CD4_baseline', + "Clinic_Name_baseline", + 'ART_regimen_baseline', +``` +## V9 Version 9 of the model is trained using 2 cohorts of datasets: -* Adult - up to 04-04-2024 -* Minor - up to 04-04-2024 +- Adult - up to 04-04-2024 +- Minor - up to 04-04-2024 Facility level predictors have been added (please see the csv shared via drive) We have simplified the model by removing some predictors - ### New predictors Here is a list of the new predictors that have been added @@ -311,11 +323,8 @@ Here is a list of the new predictors that have been added 'Facility Type' ``` - - ### Removed predictors - Here is a list of the old predictors that have been removed ``` @@ -324,8 +333,6 @@ Here is a list of the old predictors that have been removed "Clinic_Name_baseline", # Removed in V9 ``` - - ### All predictors Finally here is a list of all predictors: @@ -334,11 +341,11 @@ Finally here is a list of all predictors: X=c( - - c( - - 'Age','Age_NA', - 'Gender' , + + c( + + 'Age','Age_NA', + 'Gender' , 'num_1day_defaults_last_3visits', 'Current_Clinic_County', 'Days_defaulted_in_prev_enc', 'Days_defaulted_in_prev_enc_NA', @@ -348,42 +355,40 @@ X=c( 'Days_Since_Last_VL', 'Days_Since_Last_VL_NA', 'Visit_Number', 'HIV_disclosure_stage', 'HIV_disclosure_baseline_NA', 'Program_Name', 'Days_Since_Last_CD4', 'Days_Since_Last_CD4_NA', - 'Month', 'TB_Test_Result', + 'Month', 'TB_Test_Result', 'Viral_Load_log10', 'Viral_Load_log10_NA', 
'BMI', 'BMI_NA', 'CD4','CD4_NA', 'Facility Type' ) - - + + ) ``` - ### Model to use? #### Adult Model -IIT-Prediction/model/V9/y0_1days_adult_IIT/1_StackedEnsemble_... +`IIT-Prediction/model/V9/y0_1days_adult_IIT/1_StackedEnsemble_...` Note: Please remember to factorize all character predictors before scoring ``` -clean.df= clean.long.df %>% - mutate_if(is.character, as.factor) +clean.df= clean.long.df %>% + mutate_if(is.character, as.factor) ``` - ### Minor Model -IIT-Prediction/model/V9/y0_1day_minor_IIT/1_StackedEnsemble_... +`IIT-Prediction/model/V9/y0_1day_minor_IIT/1_StackedEnsemble_...` Note: Please remember to factorize all character predictors before scoring as shown below ``` -clean.df= clean.long.df %>% - mutate_if(is.character, as.factor) +clean.df= clean.long.df %>% + mutate_if(is.character, as.factor) ``` ### Monitoring diff --git a/README.md b/README.md index 849a3b8..ba64019 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,14 @@ # clinical-prediction-ml-models -The main objective of this project is to predict clinical outcomes using EHR Data and modern ML/AI -* IIT-Prediction - Start [Here](IIT-Prediction/README.md) +The main objective of this project is to predict clinical outcomes using EHR Data and modern ML/AI + +- IIT-Prediction - Start [Here](IIT-Prediction/README.md) +- Technical Documentation - Start [Here](readme/technical_documentation.md) ## Docker Image To build the Docker image, first create a `config.yml` using the file in the `docker-resources` folder called `config.example.yml` as a guide. Then build the Docker image: - ``` docker build --tag ampath-iit-prediction-model-: . 
``` diff --git a/SQL/iit_prod_data_extract.sql b/SQL/iit_prod_data_extract.sql index cecc468..71588a3 100644 --- a/SQL/iit_prod_data_extract.sql +++ b/SQL/iit_prod_data_extract.sql @@ -23,84 +23,84 @@ with num_1day_defaults_last_3_visits as ( on dd3.person_id = dd2.person_id and dd3.visit_number = dd2.visit_number - 1 ), - num_7day_defaults_last_3_visits as ( - select - dd1.person_id, - dd1.encounter_id, - dd1.visit_number, - case - when dd1.days_defaulted_last_encounter is null or - dd2.days_defaulted_last_encounter is null or - dd3.days_defaulted_last_encounter is null - then null - else - if(dd1.days_defaulted_last_encounter >= 7, 1, 0) + - if(dd2.days_defaulted_last_encounter >= 7, 1, 0) + - if(dd3.days_defaulted_last_encounter >= 7, 1, 0) - end as num_7day_defaults_last_3_visits - from predictions.flat_ml_days_defaulted dd1 - left join predictions.flat_ml_days_defaulted dd2 - on dd2.person_id = dd1.person_id - and dd2.visit_number = dd1.visit_number - 1 - left join predictions.flat_ml_days_defaulted dd3 - on dd3.person_id = dd2.person_id - and dd3.visit_number = dd2.visit_number - 1 - ), - num_2wk_defaults_last_3_visits as ( - select - dd1.person_id, - dd1.encounter_id, - dd1.visit_number, - case - when dd1.days_defaulted_last_encounter is null or - dd2.days_defaulted_last_encounter is null or - dd3.days_defaulted_last_encounter is null - then null - else - if(dd1.days_defaulted_last_encounter >= 14, 1, 0) + - if(dd2.days_defaulted_last_encounter >= 14, 1, 0) + - if(dd3.days_defaulted_last_encounter >= 14, 1, 0) - end as num_2wks_defaults_last_3visits - from predictions.flat_ml_days_defaulted dd1 - left join predictions.flat_ml_days_defaulted dd2 - on dd2.person_id = dd1.person_id - and dd2.visit_number = dd1.visit_number - 1 - left join predictions.flat_ml_days_defaulted dd3 - on dd3.person_id = dd2.person_id - and dd3.visit_number = dd2.visit_number - 1 - ), - num_1month_defaults_last_3_visits as ( - select - dd1.person_id, - dd1.encounter_id, - 
dd1.visit_number, - case - when dd1.days_defaulted_last_encounter is null or - dd2.days_defaulted_last_encounter is null or - dd3.days_defaulted_last_encounter is null - then null - else - if(dd1.days_defaulted_last_encounter >= 30, 1, 0) + - if(dd2.days_defaulted_last_encounter >= 30, 1, 0) + - if(dd3.days_defaulted_last_encounter >= 30, 1, 0) - end as num_1month_defaults_last_3_visits - from predictions.flat_ml_days_defaulted dd1 - left join predictions.flat_ml_days_defaulted dd2 - on dd2.person_id = dd1.person_id - and dd2.visit_number = dd1.visit_number - 1 - left join predictions.flat_ml_days_defaulted dd3 - on dd3.person_id = dd2.person_id - and dd3.visit_number = dd2.visit_number - 1 - ), - defaults_by_days as ( - select - dd.person_id, - dd.encounter_id, - encounter_date, - max(dd.days_defaulted_last_encounter) as days_defaulted - from predictions.flat_ml_days_defaulted dd - group by dd.person_id, encounter_date - ) +num_7day_defaults_last_3_visits as ( + select + dd1.person_id, + dd1.encounter_id, + dd1.visit_number, + case + when dd1.days_defaulted_last_encounter is null or + dd2.days_defaulted_last_encounter is null or + dd3.days_defaulted_last_encounter is null + then null + else + if(dd1.days_defaulted_last_encounter >= 7, 1, 0) + + if(dd2.days_defaulted_last_encounter >= 7, 1, 0) + + if(dd3.days_defaulted_last_encounter >= 7, 1, 0) + end as num_7day_defaults_last_3_visits + from predictions.flat_ml_days_defaulted dd1 + left join predictions.flat_ml_days_defaulted dd2 + on dd2.person_id = dd1.person_id + and dd2.visit_number = dd1.visit_number - 1 + left join predictions.flat_ml_days_defaulted dd3 + on dd3.person_id = dd2.person_id + and dd3.visit_number = dd2.visit_number - 1 +), +num_2wk_defaults_last_3_visits as ( + select + dd1.person_id, + dd1.encounter_id, + dd1.visit_number, + case + when dd1.days_defaulted_last_encounter is null or + dd2.days_defaulted_last_encounter is null or + dd3.days_defaulted_last_encounter is null + then null + else + 
if(dd1.days_defaulted_last_encounter >= 14, 1, 0) + + if(dd2.days_defaulted_last_encounter >= 14, 1, 0) + + if(dd3.days_defaulted_last_encounter >= 14, 1, 0) + end as num_2wks_defaults_last_3visits + from predictions.flat_ml_days_defaulted dd1 + left join predictions.flat_ml_days_defaulted dd2 + on dd2.person_id = dd1.person_id + and dd2.visit_number = dd1.visit_number - 1 + left join predictions.flat_ml_days_defaulted dd3 + on dd3.person_id = dd2.person_id + and dd3.visit_number = dd2.visit_number - 1 +), +num_1month_defaults_last_3_visits as ( + select + dd1.person_id, + dd1.encounter_id, + dd1.visit_number, + case + when dd1.days_defaulted_last_encounter is null or + dd2.days_defaulted_last_encounter is null or + dd3.days_defaulted_last_encounter is null + then null + else + if(dd1.days_defaulted_last_encounter >= 30, 1, 0) + + if(dd2.days_defaulted_last_encounter >= 30, 1, 0) + + if(dd3.days_defaulted_last_encounter >= 30, 1, 0) + end as num_1month_defaults_last_3_visits + from predictions.flat_ml_days_defaulted dd1 + left join predictions.flat_ml_days_defaulted dd2 + on dd2.person_id = dd1.person_id + and dd2.visit_number = dd1.visit_number - 1 + left join predictions.flat_ml_days_defaulted dd3 + on dd3.person_id = dd2.person_id + and dd3.visit_number = dd2.visit_number - 1 +), +defaults_by_days as ( + select + dd.person_id, + dd.encounter_id, + encounter_date, + max(dd.days_defaulted_last_encounter) as days_defaulted + from predictions.flat_ml_days_defaulted dd + group by dd.person_id, encounter_date +) -- describe the columns we need select fs.person_id, @@ -113,7 +113,7 @@ select p.gender as Gender, null as Marital_status, timestampdiff(year, - if(year(fs.arv_first_regimen_start_date) != 1900, + if(year(fs.arv_first_regimen_start_date) != 1900, -- 1900 indicates junk data date(fs.arv_first_regimen_start_date), null ), @@ -121,6 +121,8 @@ select ) as Duration_in_HIV_care, if(fs.arv_first_regimen_start_date is null or year(fs.arv_first_regimen_start_date) = 
1900, 1, 0) as Duration_in_HIV_care_NA, + -- BMI = wt / (ht / 100)^2 + -- BMI < 5.0 or over 60.0 are considered errors, usually errors in the underlying data case when fs.weight is null or fs.height is null or fs.weight < 1 or fs.height < 1 then null @@ -148,10 +150,11 @@ select timestampdiff(DAY, fs.encounter_datetime, fs.vl_resulted_date) as Days_Since_Last_VL, fs.hiv_status_disclosed as HIV_disclosure, if(fs.hiv_status_disclosed is null, 1, 0) as HIV_disclosure_NA, - -- NB Regimen Line differs from extraction data + -- Regimen Line data frequently differs from the training data fs.cur_arv_line as Regimen_Line, if(fs.cur_arv_line is null, 1, 0) as Regimen_Line_NA, coalesce(fs.is_pregnant, 0) as Pregnancy, + -- manual look-up table for site characteristics case when fs.location_id in ( -- Dumisha @@ -178,12 +181,15 @@ select when et.name in ('ADULTINITIAL', 'PEDSINITIAL', 'YOUTHINITIAL') then 'Initial' when et.name in ('ADULTRETURN', 'PEDSRETURN', 'YOUTHRETURN') then 'Return' else 'Other' - end as Encounter_Type_Class, + end as Encounter_Type_Class, null as Education_Level, null as Occupation, null as Adherence_Counselling_Sessions, l.name as Clinic_Name, replace(etl.get_arv_names(fs.cur_arv_meds), '##', '+') as ART_regimen, + -- flat_hiv_summary has a visit_number value, but it's a total counter + -- the model is trained on data from 2021, so we recalculate the visit number from the + -- default data dd.visit_number as Visit_Number, days_defaulted_last_encounter as Days_defaulted_in_prev_enc, if(days_defaulted_last_encounter is null, 1, 0) as Days_defaulted_in_prev_enc_NA, @@ -231,53 +237,55 @@ select fs.ca_cx_screening_result as CA_CX_Screening_Result, convert(month(date(fs.rtc_date)), char) as 'Month' from etl.flat_hiv_summary_v15b as fs - left join predictions.flat_ml_baseline_visit baseline - on fs.person_id = baseline.person_id - left join predictions.flat_ml_days_defaulted dd - on dd.encounter_id = fs.encounter_id - and dd.person_id = fs.person_id - join 
amrs.person p on p.person_id = fs.person_id - left join amrs.encounter_type et on fs.encounter_type = et.encounter_type_id - left join amrs.location l - on fs.location_id = l.location_id - and l.retired = 0 - -- If a patient in enrolled in PMTCT, they are also enrolled in antenatal care - -- Currently, we only keep the PMTCT record - left join etl.program_visit_map pvm - on pvm.visit_type_id = fs.visit_type - and pvm.voided is null - and (pvm.program_type_id != 42 or pvm.visit_type_id not in (51, 54)) - and (pvm.program_type_id != 52 or pvm.visit_type_id not in (1, 2)) - left join amrs.program program - on pvm.program_type_id = program.program_id - and program.retired = 0 - left join num_1day_defaults_last_3_visits 1day_defaults - on 1day_defaults.person_id = fs.person_id - and 1day_defaults.encounter_id = fs.encounter_id - left join num_7day_defaults_last_3_visits 7day_defaults - on 7day_defaults.person_id = fs.person_id - and 7day_defaults.encounter_id = fs.encounter_id - left join num_2wk_defaults_last_3_visits 2wk_defaults - on 2wk_defaults.person_id = fs.person_id - and 2wk_defaults.encounter_id = fs.encounter_id - left join num_1month_defaults_last_3_visits 1month_defaults - on 1month_defaults.person_id = fs.person_id - and 1month_defaults.encounter_id = fs.encounter_id - left join ( + left join predictions.flat_ml_baseline_visit baseline + on fs.person_id = baseline.person_id + left join predictions.flat_ml_days_defaulted dd + on dd.encounter_id = fs.encounter_id + and dd.person_id = fs.person_id + join amrs.person p on p.person_id = fs.person_id + left join amrs.encounter_type et on fs.encounter_type = et.encounter_type_id + left join amrs.location l + on fs.location_id = l.location_id + and l.retired = 0 + -- If a patient in enrolled in PMTCT, they are also enrolled in antenatal care + -- Currently, we only keep the PMTCT record + left join etl.program_visit_map pvm + on pvm.visit_type_id = fs.visit_type + and pvm.voided is null + and (pvm.program_type_id 
!= 42 or pvm.visit_type_id not in (51, 54)) + and (pvm.program_type_id != 52 or pvm.visit_type_id not in (1, 2)) + left join amrs.program program + on pvm.program_type_id = program.program_id + and program.retired = 0 + left join num_1day_defaults_last_3_visits 1day_defaults + on 1day_defaults.person_id = fs.person_id + and 1day_defaults.encounter_id = fs.encounter_id + left join num_7day_defaults_last_3_visits 7day_defaults + on 7day_defaults.person_id = fs.person_id + and 7day_defaults.encounter_id = fs.encounter_id + left join num_2wk_defaults_last_3_visits 2wk_defaults + on 2wk_defaults.person_id = fs.person_id + and 2wk_defaults.encounter_id = fs.encounter_id + left join num_1month_defaults_last_3_visits 1month_defaults + on 1month_defaults.person_id = fs.person_id + and 1month_defaults.encounter_id = fs.encounter_id + left join ( select person_id, if(days_defaulted >= 30, 1, 0) as any_30d_defaults_1yr from defaults_by_days where encounter_date between date_sub(?startDate, interval 1 year) and ?startDate group by person_id ) as 1yr on 1yr.person_id = fs.person_id - left join ( + left join ( select person_id, if(days_defaulted >= 30, 1, 0) as any_30d_defaults_2yr from defaults_by_days where encounter_date between date_sub(?startDate, interval 2 year) and ?startDate group by person_id ) as 2yr on 2yr.person_id = fs.person_id - left join predictions.ml_weekly_predictions mlp - on mlp.encounter_id = fs.encounter_id -where fs.location_id in ( + left join predictions.ml_weekly_predictions mlp + on mlp.encounter_id = fs.encounter_id +where + -- filter to only targetted locations + fs.location_id in ( -- Dumisha 26, 23, 319, 130, 313, 9, 78, 310, 20, 312, 12, 321, 8, 341, 342, 65, 314, 64, 83, 90, 106, 86, 336, 91, 320, 74, 76, 79, 100, 311, 75, 195, 19, 230, -- Uzima @@ -285,8 +293,6 @@ where fs.location_id in ( -- April 2024 rollout (NB some are included above - 420, 421, 422, & 423) 211, 60, 323, 140, 4, 322, 351, 352, 208, 69, 208, 11, 229 ) - -- test locations 
and (21 - Non AMPATH Site, 22 - None) - and fs.location_id not in (21, 22, 429, 430, 354) -- filter encounters: 111 - LabResult, 99999 - lab encounter type -- these encounters are post-visit lab result entries and should not appear in predicted data and fs.encounter_type not in (111, 99999) @@ -297,11 +303,13 @@ where fs.location_id in ( -- returned to normal status at whatever clinic they visit and (fs.transfer_in_location_id is null or fs.transfer_in_location_id != 9999) and fs.is_clinical_encounter = 1 + -- substituted from the R script and fs.rtc_date between ?startDate and ?endDate and (fs.next_clinical_datetime_hiv is null or (?retrospective and fs.next_clinical_datetime_hiv >= fs.rtc_date) ) and fs.encounter_datetime < fs.date_created + -- filter dead patients and fs.death_date is null -- if not run retrospectively, don't generate new predictions for existing cases and (?retrospective or mlp.encounter_id is null); diff --git a/docker-resources/config.example.yml b/docker-resources/config.example.yml index 006edec..4d4f2f9 100644 --- a/docker-resources/config.example.yml +++ b/docker-resources/config.example.yml @@ -1,3 +1,5 @@ +# this is the configuration file we use for this script +# it is used for the database settings default: host: "192.168.1.1" username: "username" diff --git a/docker-resources/crontab b/docker-resources/crontab index 371b0d0..9bf0b98 100644 --- a/docker-resources/crontab +++ b/docker-resources/crontab @@ -1,2 +1,4 @@ +# 3:35 AM Daily 35 3 * * * /app/run_predictions.sh +# 1 AM Daily 00 1 * * * /app/run_daily_stored_procedures.sh diff --git a/docker-resources/docker-entrypoint.sh b/docker-resources/docker-entrypoint.sh index de718f1..6c13f6f 100755 --- a/docker-resources/docker-entrypoint.sh +++ b/docker-resources/docker-entrypoint.sh @@ -1,4 +1,5 @@ #!/bin/bash cron +# This is just the default command from the RPlumber image R -e "pr <- plumber::plumb(rev(commandArgs())[1]); args <- list(host = '0.0.0.0', port = 8000); if 
(packageVersion('plumber') >= '1.0.0') { pr\$setDocs(TRUE) } else { args\$swagger <- TRUE }; do.call(pr\$run, args)" "$@" diff --git a/docker-resources/plumber.R b/docker-resources/plumber.R index 6d123c2..3ccf6e9 100644 --- a/docker-resources/plumber.R +++ b/docker-resources/plumber.R @@ -14,18 +14,24 @@ h2o.init() dbConfig <- config::get() +# Update this when the model version changes ml_model_version <- "V7" +# this is the adult model; we only load it once ml_model_adult <- h2o.loadModel( "/app/model/y0_1days_adult_IIT/1_StackedEnsemble_BestOfFamily_1_AutoML_1_20230812_150159_auc_0.775/StackedEnsemble_BestOfFamily_1_AutoML_1_20230812_150159" ) +# this is the peds model ml_model_minor <- h2o.loadModel( "/app/model/y0_1day_minor_IIT/1_StackedEnsemble_BestOfFamily_1_AutoML_2_20230813_03957_auc_0.734/StackedEnsemble_BestOfFamily_1_AutoML_2_20230813_03957" ) +# here we also load the SQL script we use to extract data ml_sql <- read_file("/app/iit_prod_data_extract.sql") +# we setup a connection pool here +# strictly speaking, a connection pool is probably overkill my_pool <- dbPool( drv = RMariaDB::MariaDB(), host = dbConfig$host, @@ -44,6 +50,7 @@ function(pr) { }) } +# this just ensures that the API always responds with the headers needed to avoid CORS errors #* @filter cors cors <- function(req, res) { res$setHeader("Access-Control-Allow-Origin", "*") @@ -62,6 +69,8 @@ cors <- function(req, res) { } } +# This is the actual endpoint definition and the place the code really starts + #* @apiTitle AMPATH Interruption in Treatment Prediction Model API #* @apiDescription This API provides a simple method to run the model for a set of weeks @@ -72,11 +81,15 @@ cors <- function(req, res) { #* @serializer json #* @post /predict function( - startDate = NA, - weeks = "1", - retrospective = "F" + startDate = NA, # the startDate is the first day to start from + # note that it will be adjusted to the Monday of the week it's in as we always run in weekly batches + weeks = "1", # 
the number of weeks to run; this is only used for testing + retrospective = "F" # whether or not the query is retrospective (run against past data for testing) or prospective + # (run normally); this mostly adjusts the query ) { retrospective <- as.logical(retrospective) + + # If the startDate is not specified, it defaults to NA and we set it to a week from today if (is.na(startDate)) { startDate = clock::add_weeks(Sys.Date(), 1) } @@ -92,6 +105,7 @@ function( # of the resuling week; this is our end date end_of_week <- week_end(clock::add_weeks(start_of_week, num_weeks)) + # here we plug the variables into the query query <- DBI::sqlInterpolate( my_pool, ml_sql, @@ -109,6 +123,7 @@ function( } # convert the dataframe to an h2o object, removing elements that are not predictors + # these also split the population by age h2o_predict_frame_adults <- predictors %>% filter(Age >= 18) %>% select(-c(person_id, encounter_id, location_id)) %>% @@ -124,15 +139,20 @@ function( # run the predictions # TODO Why does this seem to claim we're running in train / validate mode? 
results_adults <- h2o.predict(ml_model_adult, h2o_predict_frame_adults) - on.exit(h2o.rm(results_adults)) + on.exit(h2o.rm(results_adults)) # on.exit for clean-up + results_minors <- h2o.predict(ml_model_minor, h2o_predict_frame_minors) - on.exit(h2o.rm(results_minors)) + on.exit(h2o.rm(results_minors)) # on.exit for clean-up + # casting here ensures that these objects are copied as data frames, + # which makes things easier since most libraries can't interoperate with + # an H2OFrame results_adults_df <- as.data.frame(results_adults) results_minors_df <- as.data.frame(results_minors) # for the case where we need this, it should be safe to assume # that the start week has the correct values + # a cohort is the predictions generated for a given week cohort <- clock::date_format(clock::add_weeks(start_date, -1), format="%Y-W%U") # enrich the table of predictors with the results @@ -170,6 +190,7 @@ function( .keep = "unused" ) + # combine adult and peds results into one big frame prediction_result <- bind_rows(prediction_results_adults, prediction_results_minors) # add the rows from the prediction_result to the ml_weekly_predictions table @@ -193,17 +214,24 @@ week_start <- function(date) { clock::date_floor(date, "week", origin = as.Date("1970-01-05")) } +# sets origin to the first Sunday after 1970-01-01; this should guarantee that our +# ceiling is the Sunday of the specified date week_end <- function(date) { date <- as.Date(date) clock::date_ceiling(date, "week", origin = as.Date("1970-01-04")) } +# calculates the "week number" string for the week before the start date get_week_number <- function(date) { previous_week <- clock::add_weeks(week_start(date), -1) ywd <- clock::as_iso_year_week_day(previous_week) paste0(clock::get_year(ywd), "-W", stringr::str_pad(clock::get_week(ywd), 2, pad = "0")) } +# embedded SQL queries +# because the predictions are generated on Monday and then run on other days to catch newly added appointments +# but we want the thresholds to 
remain roughly the same, we use these queries to determine what the threshold +# was for this week to be considered "High Risk" or "Medium Risk" adult_risk_threshold_query <- "select 'Medium Risk' as risk, @@ -250,9 +278,10 @@ minor_risk_threshold_query <- and timestampdiff(YEAR, p.birthdate, mlp.rtc_date) < 18 group by location_id;" +# this is a utility function that mostly handles the risk thresholding predict_risk <- function(.data, cohort, age_category) { # arbitrary cut-off, but we expect one big batch per week - # and several small batches + # and several small batches; small batches are handled by this if if (nrow(.data) < 200) { cutoffs <- DBI::dbGetQuery( my_pool, @@ -297,6 +326,9 @@ predict_risk <- function(.data, cohort, age_category) { } } + # for large batches, we calculate the thresholds from the predictions themselves + # the scoring system is that the 90th percentile of risk scores are "High Risk" and the 80th percentile are "Medium Risk" + # we also break this down by location, so every location should have about 20% of its weekly visits flagged .data %>% group_by(location_id) %>% mutate( diff --git a/docker-resources/run_daily_stored_procedures.sh b/docker-resources/run_daily_stored_procedures.sh index 7ee8c97..593d231 100755 --- a/docker-resources/run_daily_stored_procedures.sh +++ b/docker-resources/run_daily_stored_procedures.sh @@ -1,3 +1,5 @@ #!/bin/bash +# this is a really simple script but it's easier to run a Bash script from cron +# and R from the Bash script than running this whole setup from cron cd /app Rscript dailyStoredProcedures.R diff --git a/docker-resources/run_predictions.sh b/docker-resources/run_predictions.sh index 641d5d9..26d3372 100644 --- a/docker-resources/run_predictions.sh +++ b/docker-resources/run_predictions.sh @@ -1,2 +1,4 @@ #!/bin/bash + +# the defaults for the API are set up for this, so we just hit the endpoint curl -X POST http://localhost:8000/predict