From dc77521fb71b0024f9b7d8db00c5f3e509f3992d Mon Sep 17 00:00:00 2001 From: Ian Date: Mon, 15 Apr 2024 16:22:12 -0400 Subject: [PATCH] Add documentation on everything --- Dockerfile | 59 +++-- IIT-Prediction/version.md | 197 +++++++------- README.md | 7 +- SQL/iit_prod_data_extract.sql | 248 +++++++++--------- docker-resources/config.example.yml | 2 + docker-resources/crontab | 2 + docker-resources/docker-entrypoint.sh | 1 + docker-resources/plumber.R | 44 +++- .../run_daily_stored_procedures.sh | 2 + docker-resources/run_predictions.sh | 2 + 10 files changed, 321 insertions(+), 243 deletions(-) diff --git a/Dockerfile b/Dockerfile index 4600b7a..cc1317f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,48 +2,71 @@ FROM rstudio/plumber:latest ENV TZ "Africa/Nairobi" +# install base libraries we need RUN apt-get -y update -qq && apt-get -y --no-install-recommends install \ - tini \ - libmariadb-dev \ - libmysqlclient21 \ - openjdk-8-jdk-headless \ - cron \ - curl \ - && rm -rf /var/lib/apt/lists/* \ - && rm -rf /etc/cron.*/* + tini \ + libmariadb-dev \ + libmysqlclient21 \ + openjdk-8-jdk-headless \ + cron \ + curl \ + && rm -rf /var/lib/apt/lists/* \ + && rm -rf /etc/cron.*/* +# install the R packages we need +# for these, the latest version should always be usable RUN install2.r --error --skipinstalled \ - tidyverse \ - pool \ - clock \ - config \ - uuid \ - readr \ - RMariaDB \ - DBI \ - && rm -rf /tmp/downloaded_packages + tidyverse \ + pool \ + clock \ + config \ + uuid \ + readr \ + RMariaDB \ + DBI \ + && rm -rf /tmp/downloaded_packages +# The model always needs to run on the exact version of h2o used to train it RUN Rscript -e "remotes::install_version('h2o', '3.42.0.2')" +# Add the prediction model to the app COPY IIT-Prediction/model/V7 /app/model +# Add the production extraction query to the app COPY SQL/iit_prod_data_extract.sql /app/iit_prod_data_extract.sql + +# The next scripts are used for cron jobs +# this script triggers the predictions to run by 
hitting the API endpoint COPY docker-resources/run_predictions.sh /app/run_predictions.sh RUN chmod 0744 /app/run_predictions.sh +# this script runs a small number of stored procedures we depend on to update +# various tables used in the predictions COPY docker-resources/run_daily_stored_procedures.sh /app/run_daily_stored_procedures.sh RUN chmod 0744 /app/run_daily_stored_procedures.sh + +# this is a Docker entrypoint script +# it ensures the cron daemon is started and then runs the API COPY docker-resources/docker-entrypoint.sh /docker-entrypoint.sh RUN chmod 0744 /docker-entrypoint.sh +# here we actually setup the cron jobs, using our source crontab +# cron is _very_ picky, so it may not be best to mess with this COPY docker-resources/crontab /etc/cron.d/iit-crontab RUN chmod 0644 /etc/cron.d/iit-crontab RUN crontab -u root /etc/cron.d/iit-crontab -RUN touch /var/log/cron.log +# now we also need to add the R code used here +# this R code actually runs the stored procedures for run_daily_stored_procedures.sh +# this is done in R so we can re-use the database settings for the API COPY docker-resources/dailyStoredProcedures.R /app/dailyStoredProcedures.R +# plumber.R is the main app COPY docker-resources/plumber.R /app/plumber.R +# EXPOSE is just documentation; by default, the API is run on port 8000 +# In production, this port is not exposed, as we hit the API from inside the container EXPOSE 8000 +# setup the entrypoint ENTRYPOINT ["tini", "--", "/docker-entrypoint.sh"] +# this may not be necessary, but its left in to match the parent container defaults CMD ["/app/plumber.R"] diff --git a/IIT-Prediction/version.md b/IIT-Prediction/version.md index bc1e00f..10713a9 100644 --- a/IIT-Prediction/version.md +++ b/IIT-Prediction/version.md @@ -1,4 +1,3 @@ - # Premise This file is used to keep track of model version metadata @@ -15,22 +14,20 @@ TODO: TODO: - ## V4 TODO: - ## V5 TODO: ## V6 -Version 6 of the model is trained using a larger dataset (All AMPATH care data) 
and more recent dataset (Anyone with an encounter after 2021): +Version 6 of the model is trained using a larger dataset (All AMPATH care data) and more recent dataset (Anyone with an encounter after 2021): -* "Cohort 2021 Patients: 95101" -* "Cohort 2021 Visits: 859184" +- "Cohort 2021 Patients: 95101" +- "Cohort 2021 Visits: 859184" Some predictors have been added while others have been removed. Here is a list of the new predictors used to train the model @@ -39,56 +36,55 @@ Some predictors have been added while others have been removed. Here is a list o HIV_disclosure_stage = if_else(is.na(hiv_disclosure_status_value),"Not Done",hiv_disclosure_status_value), Clinic_County=Clinic_County, Clinic_Name =Clinic_Name, - Program_Name = if_else(is.na(Program_Name),"Unknown",Program_Name), + Program_Name = if_else(is.na(Program_Name),"Unknown",Program_Name), # New Vars TB_screening = tb_screen, - TB_Test_Result =factor(tb_test_result), + TB_Test_Result =factor(tb_test_result), On_TB_TX = on_tb_tx, On_IPT = on_ipt, CA_CX_Screening =if_else(is.na(ca_cx_screening),0,ca_cx_screening), CA_CX_Screening_Result = factor(if_else(is.na(ca_cx_screening_result),1118,ca_cx_screening_result)) ``` - Also here is a list of all predictors: ``` X=c( - - c( 'Age','Age_NA', - 'Gender' , - 'Duration_in_HIV_care', 'Duration_in_HIV_care_NA', + + c( 'Age','Age_NA', + 'Gender' , + 'Duration_in_HIV_care', 'Duration_in_HIV_care_NA', 'BMI', 'BMI_NA', - #'Days_to_Start_of_ART', 'Days_to_Start_of_ART_NA', + #'Days_to_Start_of_ART', 'Days_to_Start_of_ART_NA', 'WHO_staging','WHO_staging_NA', 'Viral_Load_log10', 'Viral_Load_log10_NA', 'VL_suppression', 'Days_Since_Last_VL', - 'HIV_disclosure','HIV_disclosure_NA', - 'Regimen_Line', 'Regimen_Line_NA', + 'HIV_disclosure','HIV_disclosure_NA', + 'Regimen_Line', 'Regimen_Line_NA', 'Pregnancy', 'CD4','CD4_NA', 'Days_Since_Last_CD4', "Encounter_Type_Class", 'ART_regimen', - 'Visit_Number', + 'Visit_Number', 'Days_defaulted_in_prev_enc', 
'Days_defaulted_in_prev_enc_NA', 'num_2wks_defaults_last_3visits', 'num_2wks_defaults_last_3visits_NA', 'ever_defaulted_by_1m_in_last_1year','ever_defaulted_by_1m_in_last_1year_NA', 'ever_defaulted_by_1m_in_last_2year','ever_defaulted_by_1m_in_last_2year_NA', - + # Baseline 'Age_baseline', - 'Gender_baseline' , + 'Gender_baseline' , 'BMI_baseline', 'WHO_staging_baseline', - 'VL_suppression_baseline', + 'VL_suppression_baseline', 'Viral_Load_log10_baseline', 'HIV_disclosure_baseline', - 'Regimen_Line_baseline', + 'Regimen_Line_baseline', 'Pregnancy_baseline', 'CD4_baseline', - "Clinic_Name_baseline", + "Clinic_Name_baseline", 'ART_regimen_baseline', - + # New Vars 'ART_Adherence', 'HIV_disclosure_stage', @@ -96,35 +92,32 @@ X=c( 'Clinic_Name', 'Program_Name', 'TB_screening', - 'TB_Test_Result', + 'TB_Test_Result', 'On_TB_TX', 'On_IPT', 'CA_CX_Screening', 'CA_CX_Screening_Result' - - - + + + ) - - + + ) ``` - ### Model to use? 2_StackedEnsemble_BestOfFamily_1_AutoML_8_20230726_142520_auc_0.704 - ## V7 - Version 7 of the model is trained using 2 cohorts of datasets: -* Adult 88,809 (93.383876%) -* Minor 6,292 (6.616124%) +- Adult 88,809 (93.383876%) +- Minor 6,292 (6.616124%) With these changes, the cross-validated AUC has increased from ~70 to ~77 @@ -141,7 +134,6 @@ Here is a list of the new predictors that have been added 'num_1month_defaults_last_3visits' ``` - Please see the util files on how these variables are define ``` @@ -149,18 +141,17 @@ Please see the util files on how these variables are define num_1day_defaults_last_3visits = as.double(lag(rollapplyr(`disengagement-1day_bin`, 3, sum, partial =TRUE),order_by = Encounter_ID)), num_1day_defaults_last_3visits = if_else(is.na(num_1day_defaults_last_3visits), 0, num_1day_defaults_last_3visits), - + num_7days_defaults_last_3visits = as.double(lag(rollapplyr(`disengagement-7days_bin`, 3, sum, partial =TRUE),order_by = Encounter_ID)), num_7days_defaults_last_3visits = 
if_else(is.na(num_7days_defaults_last_3visits), 0, num_7days_defaults_last_3visits), - + num_1month_defaults_last_3visits = as.double(lag(rollapplyr(`disengagement-1month_bin`, 3, sum, partial =TRUE),order_by = Encounter_ID)), num_1month_defaults_last_3visits = if_else(is.na(num_1month_defaults_last_3visits), 0, num_1month_defaults_last_3visits), - + ``` ### Removed predictors - Here is a list of the old predictors that have been removed ``` @@ -168,8 +159,6 @@ Here is a list of the old predictors that have been removed 'num_2wks_defaults_last_3visits_NA' ``` - - ### All predictors Finally here is a list of all predictors: @@ -178,72 +167,71 @@ Finally here is a list of all predictors: X=c( - - c( 'Age','Age_NA', - 'Gender' , - 'Duration_in_HIV_care', 'Duration_in_HIV_care_NA', + + c( 'Age','Age_NA', + 'Gender' , + 'Duration_in_HIV_care', 'Duration_in_HIV_care_NA', 'BMI', 'BMI_NA', - #'Days_to_Start_of_ART', 'Days_to_Start_of_ART_NA', + #'Days_to_Start_of_ART', 'Days_to_Start_of_ART_NA', 'WHO_staging','WHO_staging_NA', - 'Viral_Load_log10', 'Viral_Load_log10_NA', # REMOVED (V7) 'VL_suppression', + 'Viral_Load_log10', 'Viral_Load_log10_NA', # REMOVED (V7) 'VL_suppression', 'Days_Since_Last_VL', - 'HIV_disclosure','HIV_disclosure_NA', - 'Regimen_Line', 'Regimen_Line_NA', + 'HIV_disclosure','HIV_disclosure_NA', + 'Regimen_Line', 'Regimen_Line_NA', 'Pregnancy', 'CD4','CD4_NA', 'Days_Since_Last_CD4', "Encounter_Type_Class", 'ART_regimen', - 'Visit_Number', + 'Visit_Number', 'Days_defaulted_in_prev_enc', 'Days_defaulted_in_prev_enc_NA', 'num_2wks_defaults_last_3visits', # REMOVED (V7) 'num_2wks_defaults_last_3visits_NA', 'ever_defaulted_by_1m_in_last_1year','ever_defaulted_by_1m_in_last_1year_NA', 'ever_defaulted_by_1m_in_last_2year','ever_defaulted_by_1m_in_last_2year_NA', - + # Baseline 'Age_baseline', - 'Gender_baseline' , + 'Gender_baseline' , 'BMI_baseline', 'WHO_staging_baseline', - 'VL_suppression_baseline', + 'VL_suppression_baseline', 
'Viral_Load_log10_baseline', 'HIV_disclosure_baseline', - 'Regimen_Line_baseline', + 'Regimen_Line_baseline', 'Pregnancy_baseline', 'CD4_baseline', - "Clinic_Name_baseline", + "Clinic_Name_baseline", 'ART_regimen_baseline', - + # New Vars (V6) 'ART_Adherence', 'HIV_disclosure_stage', 'Clinic_County', 'Clinic_Name', - 'Program_Name', + 'Program_Name', 'TB_screening', - 'TB_Test_Result', + 'TB_Test_Result', 'On_TB_TX', 'On_IPT', 'CA_CX_Screening', 'CA_CX_Screening_Result', - + # New Var (V7) 'Month', 'num_1day_defaults_last_3visits', 'num_7days_defaults_last_3visits', 'num_1month_defaults_last_3visits' - - - - + + + + ) - - + + ) ``` - ### Model to use? #### Adult Model @@ -253,11 +241,10 @@ IIT-Prediction/model/V7/y0_1days_adult_IIT/1_StackedEnsemble_BestOfFamily_1_Auto Note: Please remember to factorize all character predictors before scoring ``` -clean.df= clean.long.df %>% - mutate_if(is.character, as.factor) +clean.df= clean.long.df %>% + mutate_if(is.character, as.factor) ``` - ### Minor Model IIT-Prediction/model/V7/y0_1day_minor_IIT/1_StackedEnsemble_BestOfFamily_1_AutoML_2_20230813_03957_auc_0.734 @@ -265,28 +252,53 @@ IIT-Prediction/model/V7/y0_1day_minor_IIT/1_StackedEnsemble_BestOfFamily_1_AutoM Note: Please remember to factorize all character predictors before scoring as shown below ``` -clean.df= clean.long.df %>% - mutate_if(is.character, as.factor) +clean.df= clean.long.df %>% + mutate_if(is.character, as.factor) ``` ### Monitoring Please save logs especially warning logs which we can use to track any drift in concept or bad variables. 
+## V8 -## V9 +### Removed Predictors +``` + 'TB_screening', + 'On_TB_TX', + 'On_IPT', + 'CA_CX_Screening', + 'CA_CX_Screening_Result', + 'num_2wks_defaults_last_3visits', + 'ever_defaulted_by_1m_in_last_1year', + 'ever_defaulted_by_1m_in_last_1year_NA', + 'ever_defaulted_by_1m_in_last_2year', + 'ever_defaulted_by_1m_in_last_2year_NA', + 'Age_baseline', + 'Gender_baseline' , + 'BMI_baseline', + 'WHO_staging_baseline', + 'VL_suppression_baseline', + 'Viral_Load_log10_baseline', + 'HIV_disclosure_baseline', + 'Regimen_Line_baseline', + 'Pregnancy_baseline', + 'CD4_baseline', + "Clinic_Name_baseline", + 'ART_regimen_baseline', +``` +## V9 Version 9 of the model is trained using 2 cohorts of datasets: -* Adult - up to 04-04-2024 -* Minor - up to 04-04-2024 +- Adult - up to 04-04-2024 +- Minor - up to 04-04-2024 Facility level predictors have been added (please see the csv shared via drive) We have simplified the model by removing some predictors - ### New predictors Here is a list of the new predictors that have been added @@ -311,11 +323,8 @@ Here is a list of the new predictors that have been added 'Facility Type' ``` - - ### Removed predictors - Here is a list of the old predictors that have been removed ``` @@ -324,8 +333,6 @@ Here is a list of the old predictors that have been removed "Clinic_Name_baseline", # Removed in V9 ``` - - ### All predictors Finally here is a list of all predictors: @@ -334,11 +341,11 @@ Finally here is a list of all predictors: X=c( - - c( - - 'Age','Age_NA', - 'Gender' , + + c( + + 'Age','Age_NA', + 'Gender' , 'num_1day_defaults_last_3visits', 'Current_Clinic_County', 'Days_defaulted_in_prev_enc', 'Days_defaulted_in_prev_enc_NA', @@ -348,42 +355,40 @@ X=c( 'Days_Since_Last_VL', 'Days_Since_Last_VL_NA', 'Visit_Number', 'HIV_disclosure_stage', 'HIV_disclosure_baseline_NA', 'Program_Name', 'Days_Since_Last_CD4', 'Days_Since_Last_CD4_NA', - 'Month', 'TB_Test_Result', + 'Month', 'TB_Test_Result', 'Viral_Load_log10', 'Viral_Load_log10_NA', 
'BMI', 'BMI_NA', 'CD4','CD4_NA', 'Facility Type' ) - - + + ) ``` - ### Model to use? #### Adult Model -IIT-Prediction/model/V9/y0_1days_adult_IIT/1_StackedEnsemble_... +`IIT-Prediction/model/V9/y0_1days_adult_IIT/1_StackedEnsemble_...` Note: Please remember to factorize all character predictors before scoring ``` -clean.df= clean.long.df %>% - mutate_if(is.character, as.factor) +clean.df= clean.long.df %>% + mutate_if(is.character, as.factor) ``` - ### Minor Model -IIT-Prediction/model/V9/y0_1day_minor_IIT/1_StackedEnsemble_... +`IIT-Prediction/model/V9/y0_1day_minor_IIT/1_StackedEnsemble_...` Note: Please remember to factorize all character predictors before scoring as shown below ``` -clean.df= clean.long.df %>% - mutate_if(is.character, as.factor) +clean.df= clean.long.df %>% + mutate_if(is.character, as.factor) ``` ### Monitoring diff --git a/README.md b/README.md index 849a3b8..ba64019 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,14 @@ # clinical-prediction-ml-models -The main objective of this project is to predict clinical outcomes using EHR Data and modern ML/AI -* IIT-Prediction - Start [Here](IIT-Prediction/README.md) +The main objective of this project is to predict clinical outcomes using EHR Data and modern ML/AI + +- IIT-Prediction - Start [Here](IIT-Prediction/README.md) +- Technical Documentation - Start [Here](readme/technical_documentation.md) ## Docker Image To build the Docker image, first create a `config.yml` using the file in the `docker-resources` folder called `config.example.yml` as a guide. Then build the Docker image: - ``` docker build --tag ampath-iit-prediction-model-: . 
``` diff --git a/SQL/iit_prod_data_extract.sql b/SQL/iit_prod_data_extract.sql index cecc468..71588a3 100644 --- a/SQL/iit_prod_data_extract.sql +++ b/SQL/iit_prod_data_extract.sql @@ -23,84 +23,84 @@ with num_1day_defaults_last_3_visits as ( on dd3.person_id = dd2.person_id and dd3.visit_number = dd2.visit_number - 1 ), - num_7day_defaults_last_3_visits as ( - select - dd1.person_id, - dd1.encounter_id, - dd1.visit_number, - case - when dd1.days_defaulted_last_encounter is null or - dd2.days_defaulted_last_encounter is null or - dd3.days_defaulted_last_encounter is null - then null - else - if(dd1.days_defaulted_last_encounter >= 7, 1, 0) + - if(dd2.days_defaulted_last_encounter >= 7, 1, 0) + - if(dd3.days_defaulted_last_encounter >= 7, 1, 0) - end as num_7day_defaults_last_3_visits - from predictions.flat_ml_days_defaulted dd1 - left join predictions.flat_ml_days_defaulted dd2 - on dd2.person_id = dd1.person_id - and dd2.visit_number = dd1.visit_number - 1 - left join predictions.flat_ml_days_defaulted dd3 - on dd3.person_id = dd2.person_id - and dd3.visit_number = dd2.visit_number - 1 - ), - num_2wk_defaults_last_3_visits as ( - select - dd1.person_id, - dd1.encounter_id, - dd1.visit_number, - case - when dd1.days_defaulted_last_encounter is null or - dd2.days_defaulted_last_encounter is null or - dd3.days_defaulted_last_encounter is null - then null - else - if(dd1.days_defaulted_last_encounter >= 14, 1, 0) + - if(dd2.days_defaulted_last_encounter >= 14, 1, 0) + - if(dd3.days_defaulted_last_encounter >= 14, 1, 0) - end as num_2wks_defaults_last_3visits - from predictions.flat_ml_days_defaulted dd1 - left join predictions.flat_ml_days_defaulted dd2 - on dd2.person_id = dd1.person_id - and dd2.visit_number = dd1.visit_number - 1 - left join predictions.flat_ml_days_defaulted dd3 - on dd3.person_id = dd2.person_id - and dd3.visit_number = dd2.visit_number - 1 - ), - num_1month_defaults_last_3_visits as ( - select - dd1.person_id, - dd1.encounter_id, - 
dd1.visit_number, - case - when dd1.days_defaulted_last_encounter is null or - dd2.days_defaulted_last_encounter is null or - dd3.days_defaulted_last_encounter is null - then null - else - if(dd1.days_defaulted_last_encounter >= 30, 1, 0) + - if(dd2.days_defaulted_last_encounter >= 30, 1, 0) + - if(dd3.days_defaulted_last_encounter >= 30, 1, 0) - end as num_1month_defaults_last_3_visits - from predictions.flat_ml_days_defaulted dd1 - left join predictions.flat_ml_days_defaulted dd2 - on dd2.person_id = dd1.person_id - and dd2.visit_number = dd1.visit_number - 1 - left join predictions.flat_ml_days_defaulted dd3 - on dd3.person_id = dd2.person_id - and dd3.visit_number = dd2.visit_number - 1 - ), - defaults_by_days as ( - select - dd.person_id, - dd.encounter_id, - encounter_date, - max(dd.days_defaulted_last_encounter) as days_defaulted - from predictions.flat_ml_days_defaulted dd - group by dd.person_id, encounter_date - ) +num_7day_defaults_last_3_visits as ( + select + dd1.person_id, + dd1.encounter_id, + dd1.visit_number, + case + when dd1.days_defaulted_last_encounter is null or + dd2.days_defaulted_last_encounter is null or + dd3.days_defaulted_last_encounter is null + then null + else + if(dd1.days_defaulted_last_encounter >= 7, 1, 0) + + if(dd2.days_defaulted_last_encounter >= 7, 1, 0) + + if(dd3.days_defaulted_last_encounter >= 7, 1, 0) + end as num_7day_defaults_last_3_visits + from predictions.flat_ml_days_defaulted dd1 + left join predictions.flat_ml_days_defaulted dd2 + on dd2.person_id = dd1.person_id + and dd2.visit_number = dd1.visit_number - 1 + left join predictions.flat_ml_days_defaulted dd3 + on dd3.person_id = dd2.person_id + and dd3.visit_number = dd2.visit_number - 1 +), +num_2wk_defaults_last_3_visits as ( + select + dd1.person_id, + dd1.encounter_id, + dd1.visit_number, + case + when dd1.days_defaulted_last_encounter is null or + dd2.days_defaulted_last_encounter is null or + dd3.days_defaulted_last_encounter is null + then null + else + 
if(dd1.days_defaulted_last_encounter >= 14, 1, 0) + + if(dd2.days_defaulted_last_encounter >= 14, 1, 0) + + if(dd3.days_defaulted_last_encounter >= 14, 1, 0) + end as num_2wks_defaults_last_3visits + from predictions.flat_ml_days_defaulted dd1 + left join predictions.flat_ml_days_defaulted dd2 + on dd2.person_id = dd1.person_id + and dd2.visit_number = dd1.visit_number - 1 + left join predictions.flat_ml_days_defaulted dd3 + on dd3.person_id = dd2.person_id + and dd3.visit_number = dd2.visit_number - 1 +), +num_1month_defaults_last_3_visits as ( + select + dd1.person_id, + dd1.encounter_id, + dd1.visit_number, + case + when dd1.days_defaulted_last_encounter is null or + dd2.days_defaulted_last_encounter is null or + dd3.days_defaulted_last_encounter is null + then null + else + if(dd1.days_defaulted_last_encounter >= 30, 1, 0) + + if(dd2.days_defaulted_last_encounter >= 30, 1, 0) + + if(dd3.days_defaulted_last_encounter >= 30, 1, 0) + end as num_1month_defaults_last_3_visits + from predictions.flat_ml_days_defaulted dd1 + left join predictions.flat_ml_days_defaulted dd2 + on dd2.person_id = dd1.person_id + and dd2.visit_number = dd1.visit_number - 1 + left join predictions.flat_ml_days_defaulted dd3 + on dd3.person_id = dd2.person_id + and dd3.visit_number = dd2.visit_number - 1 +), +defaults_by_days as ( + select + dd.person_id, + dd.encounter_id, + encounter_date, + max(dd.days_defaulted_last_encounter) as days_defaulted + from predictions.flat_ml_days_defaulted dd + group by dd.person_id, encounter_date +) -- describe the columns we need select fs.person_id, @@ -113,7 +113,7 @@ select p.gender as Gender, null as Marital_status, timestampdiff(year, - if(year(fs.arv_first_regimen_start_date) != 1900, + if(year(fs.arv_first_regimen_start_date) != 1900, -- 1900 indicates junk data date(fs.arv_first_regimen_start_date), null ), @@ -121,6 +121,8 @@ select ) as Duration_in_HIV_care, if(fs.arv_first_regimen_start_date is null or year(fs.arv_first_regimen_start_date) = 
1900, 1, 0) as Duration_in_HIV_care_NA, + -- BMI = wt / (ht / 100)^2 + -- BMI < 5.0 or over 60.0 are considered errors, usually errors in the underlying data case when fs.weight is null or fs.height is null or fs.weight < 1 or fs.height < 1 then null @@ -148,10 +150,11 @@ select timestampdiff(DAY, fs.encounter_datetime, fs.vl_resulted_date) as Days_Since_Last_VL, fs.hiv_status_disclosed as HIV_disclosure, if(fs.hiv_status_disclosed is null, 1, 0) as HIV_disclosure_NA, - -- NB Regimen Line differs from extraction data + -- Regimen Line data frequently differs from the training data fs.cur_arv_line as Regimen_Line, if(fs.cur_arv_line is null, 1, 0) as Regimen_Line_NA, coalesce(fs.is_pregnant, 0) as Pregnancy, + -- manual look-up table for site characteristics case when fs.location_id in ( -- Dumisha @@ -178,12 +181,15 @@ select when et.name in ('ADULTINITIAL', 'PEDSINITIAL', 'YOUTHINITIAL') then 'Initial' when et.name in ('ADULTRETURN', 'PEDSRETURN', 'YOUTHRETURN') then 'Return' else 'Other' - end as Encounter_Type_Class, + end as Encounter_Type_Class, null as Education_Level, null as Occupation, null as Adherence_Counselling_Sessions, l.name as Clinic_Name, replace(etl.get_arv_names(fs.cur_arv_meds), '##', '+') as ART_regimen, + -- flat_hiv_summary has a visit_number value, but it's a total counter + -- the model is trained on data from 2021, so we recalculate the visit number from the + -- default data dd.visit_number as Visit_Number, days_defaulted_last_encounter as Days_defaulted_in_prev_enc, if(days_defaulted_last_encounter is null, 1, 0) as Days_defaulted_in_prev_enc_NA, @@ -231,53 +237,55 @@ select fs.ca_cx_screening_result as CA_CX_Screening_Result, convert(month(date(fs.rtc_date)), char) as 'Month' from etl.flat_hiv_summary_v15b as fs - left join predictions.flat_ml_baseline_visit baseline - on fs.person_id = baseline.person_id - left join predictions.flat_ml_days_defaulted dd - on dd.encounter_id = fs.encounter_id - and dd.person_id = fs.person_id - join 
amrs.person p on p.person_id = fs.person_id - left join amrs.encounter_type et on fs.encounter_type = et.encounter_type_id - left join amrs.location l - on fs.location_id = l.location_id - and l.retired = 0 - -- If a patient in enrolled in PMTCT, they are also enrolled in antenatal care - -- Currently, we only keep the PMTCT record - left join etl.program_visit_map pvm - on pvm.visit_type_id = fs.visit_type - and pvm.voided is null - and (pvm.program_type_id != 42 or pvm.visit_type_id not in (51, 54)) - and (pvm.program_type_id != 52 or pvm.visit_type_id not in (1, 2)) - left join amrs.program program - on pvm.program_type_id = program.program_id - and program.retired = 0 - left join num_1day_defaults_last_3_visits 1day_defaults - on 1day_defaults.person_id = fs.person_id - and 1day_defaults.encounter_id = fs.encounter_id - left join num_7day_defaults_last_3_visits 7day_defaults - on 7day_defaults.person_id = fs.person_id - and 7day_defaults.encounter_id = fs.encounter_id - left join num_2wk_defaults_last_3_visits 2wk_defaults - on 2wk_defaults.person_id = fs.person_id - and 2wk_defaults.encounter_id = fs.encounter_id - left join num_1month_defaults_last_3_visits 1month_defaults - on 1month_defaults.person_id = fs.person_id - and 1month_defaults.encounter_id = fs.encounter_id - left join ( + left join predictions.flat_ml_baseline_visit baseline + on fs.person_id = baseline.person_id + left join predictions.flat_ml_days_defaulted dd + on dd.encounter_id = fs.encounter_id + and dd.person_id = fs.person_id + join amrs.person p on p.person_id = fs.person_id + left join amrs.encounter_type et on fs.encounter_type = et.encounter_type_id + left join amrs.location l + on fs.location_id = l.location_id + and l.retired = 0 + -- If a patient in enrolled in PMTCT, they are also enrolled in antenatal care + -- Currently, we only keep the PMTCT record + left join etl.program_visit_map pvm + on pvm.visit_type_id = fs.visit_type + and pvm.voided is null + and (pvm.program_type_id 
!= 42 or pvm.visit_type_id not in (51, 54)) + and (pvm.program_type_id != 52 or pvm.visit_type_id not in (1, 2)) + left join amrs.program program + on pvm.program_type_id = program.program_id + and program.retired = 0 + left join num_1day_defaults_last_3_visits 1day_defaults + on 1day_defaults.person_id = fs.person_id + and 1day_defaults.encounter_id = fs.encounter_id + left join num_7day_defaults_last_3_visits 7day_defaults + on 7day_defaults.person_id = fs.person_id + and 7day_defaults.encounter_id = fs.encounter_id + left join num_2wk_defaults_last_3_visits 2wk_defaults + on 2wk_defaults.person_id = fs.person_id + and 2wk_defaults.encounter_id = fs.encounter_id + left join num_1month_defaults_last_3_visits 1month_defaults + on 1month_defaults.person_id = fs.person_id + and 1month_defaults.encounter_id = fs.encounter_id + left join ( select person_id, if(days_defaulted >= 30, 1, 0) as any_30d_defaults_1yr from defaults_by_days where encounter_date between date_sub(?startDate, interval 1 year) and ?startDate group by person_id ) as 1yr on 1yr.person_id = fs.person_id - left join ( + left join ( select person_id, if(days_defaulted >= 30, 1, 0) as any_30d_defaults_2yr from defaults_by_days where encounter_date between date_sub(?startDate, interval 2 year) and ?startDate group by person_id ) as 2yr on 2yr.person_id = fs.person_id - left join predictions.ml_weekly_predictions mlp - on mlp.encounter_id = fs.encounter_id -where fs.location_id in ( + left join predictions.ml_weekly_predictions mlp + on mlp.encounter_id = fs.encounter_id +where + -- filter to only targetted locations + fs.location_id in ( -- Dumisha 26, 23, 319, 130, 313, 9, 78, 310, 20, 312, 12, 321, 8, 341, 342, 65, 314, 64, 83, 90, 106, 86, 336, 91, 320, 74, 76, 79, 100, 311, 75, 195, 19, 230, -- Uzima @@ -285,8 +293,6 @@ where fs.location_id in ( -- April 2024 rollout (NB some are included above - 420, 421, 422, & 423) 211, 60, 323, 140, 4, 322, 351, 352, 208, 69, 208, 11, 229 ) - -- test locations 
and (21 - Non AMPATH Site, 22 - None) - and fs.location_id not in (21, 22, 429, 430, 354) -- filter encounters: 111 - LabResult, 99999 - lab encounter type -- these encounters are post-visit lab result entries and should not appear in predicted data and fs.encounter_type not in (111, 99999) @@ -297,11 +303,13 @@ where fs.location_id in ( -- returned to normal status at whatever clinic they visit and (fs.transfer_in_location_id is null or fs.transfer_in_location_id != 9999) and fs.is_clinical_encounter = 1 + -- substituted from the R script and fs.rtc_date between ?startDate and ?endDate and (fs.next_clinical_datetime_hiv is null or (?retrospective and fs.next_clinical_datetime_hiv >= fs.rtc_date) ) and fs.encounter_datetime < fs.date_created + -- filter dead patients and fs.death_date is null -- if not run retrospectively, don't generate new predictions for existing cases and (?retrospective or mlp.encounter_id is null); diff --git a/docker-resources/config.example.yml b/docker-resources/config.example.yml index 006edec..4d4f2f9 100644 --- a/docker-resources/config.example.yml +++ b/docker-resources/config.example.yml @@ -1,3 +1,5 @@ +# this is the configuration file we use for this script +# it is used for the database settings default: host: "192.168.1.1" username: "username" diff --git a/docker-resources/crontab b/docker-resources/crontab index 371b0d0..9bf0b98 100644 --- a/docker-resources/crontab +++ b/docker-resources/crontab @@ -1,2 +1,4 @@ +# 3:35 AM Daily 35 3 * * * /app/run_predictions.sh +# 1 AM Daily 00 1 * * * /app/run_daily_stored_procedures.sh diff --git a/docker-resources/docker-entrypoint.sh b/docker-resources/docker-entrypoint.sh index de718f1..6c13f6f 100755 --- a/docker-resources/docker-entrypoint.sh +++ b/docker-resources/docker-entrypoint.sh @@ -1,4 +1,5 @@ #!/bin/bash cron +# This is just the default command from the RPlumber image R -e "pr <- plumber::plumb(rev(commandArgs())[1]); args <- list(host = '0.0.0.0', port = 8000); if 
(packageVersion('plumber') >= '1.0.0') { pr\$setDocs(TRUE) } else { args\$swagger <- TRUE }; do.call(pr\$run, args)" "$@" diff --git a/docker-resources/plumber.R b/docker-resources/plumber.R index 6d123c2..3ccf6e9 100644 --- a/docker-resources/plumber.R +++ b/docker-resources/plumber.R @@ -14,18 +14,24 @@ h2o.init() dbConfig <- config::get() +# Update this when the model version changes ml_model_version <- "V7" +# this is the adult model; we only load it once ml_model_adult <- h2o.loadModel( "/app/model/y0_1days_adult_IIT/1_StackedEnsemble_BestOfFamily_1_AutoML_1_20230812_150159_auc_0.775/StackedEnsemble_BestOfFamily_1_AutoML_1_20230812_150159" ) +# this is the peds model ml_model_minor <- h2o.loadModel( "/app/model/y0_1day_minor_IIT/1_StackedEnsemble_BestOfFamily_1_AutoML_2_20230813_03957_auc_0.734/StackedEnsemble_BestOfFamily_1_AutoML_2_20230813_03957" ) +# here we also load the SQL script we use to extract data ml_sql <- read_file("/app/iit_prod_data_extract.sql") +# we setup a connection pool here +# strictly speaking, a connection pool is probably overkill my_pool <- dbPool( drv = RMariaDB::MariaDB(), host = dbConfig$host, @@ -44,6 +50,7 @@ function(pr) { }) } +# this just ensures that the API always responds with the headers needed to avoid CORS errors #* @filter cors cors <- function(req, res) { res$setHeader("Access-Control-Allow-Origin", "*") @@ -62,6 +69,8 @@ cors <- function(req, res) { } } +# This is the actual endpoint definition and the place the code really starts + #* @apiTitle AMPATH Interruption in Treatment Prediction Model API #* @apiDescription This API provides a simple method to run the model for a set of weeks @@ -72,11 +81,15 @@ cors <- function(req, res) { #* @serializer json #* @post /predict function( - startDate = NA, - weeks = "1", - retrospective = "F" + startDate = NA, # the startDate is the first day to start from + # note that it will be adjusted to the Monday of the week it's in as we always run in weekly batches + weeks = "1", # 
the number of weeks to run; this is only used for testing + retrospective = "F" # whether or not the query is retrospective (run against past data for testing) or prospective + # (run normally); this mostly adjusts the query ) { retrospective <- as.logical(retrospective) + + # If the startDate is not specified, it defaults to NA and we set it to a week from today if (is.na(startDate)) { startDate = clock::add_weeks(Sys.Date(), 1) } @@ -92,6 +105,7 @@ function( # of the resuling week; this is our end date end_of_week <- week_end(clock::add_weeks(start_of_week, num_weeks)) + # here we plug the variables into the query query <- DBI::sqlInterpolate( my_pool, ml_sql, @@ -109,6 +123,7 @@ function( } # convert the dataframe to an h2o object, removing elements that are not predictors + # these also split the population by age h2o_predict_frame_adults <- predictors %>% filter(Age >= 18) %>% select(-c(person_id, encounter_id, location_id)) %>% @@ -124,15 +139,20 @@ function( # run the predictions # TODO Why does this seem to claim we're running in train / validate mode? 
results_adults <- h2o.predict(ml_model_adult, h2o_predict_frame_adults) - on.exit(h2o.rm(results_adults)) + on.exit(h2o.rm(results_adults)) # on.exit for clean-up + results_minors <- h2o.predict(ml_model_minor, h2o_predict_frame_minors) - on.exit(h2o.rm(results_minors)) + on.exit(h2o.rm(results_minors)) # on.exit for clean-up + # casting here ensures that these objects are copied as data frames, + # which makes things easier since most libraries can't interoperate with + # an H2OFrame results_adults_df <- as.data.frame(results_adults) results_minors_df <- as.data.frame(results_minors) # for the case where we need this, it should be safe to assume # that the start week has the correct values + # a cohort is the predictions generated for a given week cohort <- clock::date_format(clock::add_weeks(start_date, -1), format="%Y-W%U") # enrich the table of predictors with the results @@ -170,6 +190,7 @@ function( .keep = "unused" ) + # combine adult and peds results into one big frame prediction_result <- bind_rows(prediction_results_adults, prediction_results_minors) # add the rows from the prediction_result to the ml_weekly_predictions table @@ -193,17 +214,24 @@ week_start <- function(date) { clock::date_floor(date, "week", origin = as.Date("1970-01-05")) } +# sets origin to the first Sunday after 1970-01-01; this should guarantee that our +# ceiling is the Sunday of the specified date week_end <- function(date) { date <- as.Date(date) clock::date_ceiling(date, "week", origin = as.Date("1970-01-04")) } +# calculates the "week number" string for the week before the start date get_week_number <- function(date) { previous_week <- clock::add_weeks(week_start(date), -1) ywd <- clock::as_iso_year_week_day(previous_week) paste0(clock::get_year(ywd), "-W", stringr::str_pad(clock::get_week(ywd), 2, pad = "0")) } +# embedded SQL queries +# because the predictions are generated on Monday and then run on other days to catch newly added appointments +# but we want the thresholds to 
remain roughly the same, we use these queries to determine what the threshold +# was for this week to be considered "High Risk" or "Medium Risk" adult_risk_threshold_query <- "select 'Medium Risk' as risk, @@ -250,9 +278,10 @@ minor_risk_threshold_query <- and timestampdiff(YEAR, p.birthdate, mlp.rtc_date) < 18 group by location_id;" +# this is a utility function that mostly handles the risk thresholding predict_risk <- function(.data, cohort, age_category) { # arbitrary cut-off, but we expect one big batch per week - # and several small batches + # and several small batches; small batches are handled by this if if (nrow(.data) < 200) { cutoffs <- DBI::dbGetQuery( my_pool, @@ -297,6 +326,9 @@ predict_risk <- function(.data, cohort, age_category) { } } + # for large batches, we calculate the thresholds from the predictions themselves + # the scoring system is that the 90th percentile of risk scores are "High Risk" and the 80th percentile are "Medium Risk" + # we also break this down by location, so every location should have about 20% of its weekly visits flagged .data %>% group_by(location_id) %>% mutate( diff --git a/docker-resources/run_daily_stored_procedures.sh b/docker-resources/run_daily_stored_procedures.sh index 7ee8c97..593d231 100755 --- a/docker-resources/run_daily_stored_procedures.sh +++ b/docker-resources/run_daily_stored_procedures.sh @@ -1,3 +1,5 @@ #!/bin/bash +# this is a really simple script but it's easier to run a Bash script from cron +# and R from the Bash script than running this whole setup from cron cd /app Rscript dailyStoredProcedures.R diff --git a/docker-resources/run_predictions.sh b/docker-resources/run_predictions.sh index 641d5d9..26d3372 100644 --- a/docker-resources/run_predictions.sh +++ b/docker-resources/run_predictions.sh @@ -1,2 +1,4 @@ #!/bin/bash + +# the defaults for the API are set up for this, so we just hit the endpoint curl -X POST http://localhost:8000/predict