Merge pull request #202 from cancervariants/staging
Staging
korikuzma authored Nov 11, 2022
2 parents 2f9b644 + 6f8f685 commit 7dc3e94
Showing 37 changed files with 5,191 additions and 5,848 deletions.
4 changes: 3 additions & 1 deletion .ebextensions/02_app_config.config
@@ -5,7 +5,9 @@ commands:
    command: "yum install -y awscli"
  03_install_unzip:
    command: "yum install -y unzip"
-  04_export_eb_env_var:
+  04_eb_packages:
+    command: "/var/app/venv/staging-LQM1lest/bin/pip install uvloop websockets httptools typing-extensions"
+  05_export_eb_env_var:
    command: "export $(cat /opt/elasticbeanstalk/deployment/env | xargs)"

container_commands:
4 changes: 2 additions & 2 deletions .github/workflows/release.yaml
@@ -9,8 +9,8 @@ jobs:
    runs-on: ubuntu-latest

    steps:
-      - uses: actions/checkout@v2
-      - uses: actions/setup-python@v2
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
      - name: Install dependencies
        run: |
          python3 -m pip install --upgrade pip
1 change: 0 additions & 1 deletion .pre-commit-config.yaml
@@ -9,5 +9,4 @@ repos:
  - id: check-added-large-files
    args: ['--maxkb=1024']
    exclude: ^tests/data
-  - id: detect-aws-credentials
  - id: detect-private-key
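
For contributors, the remaining hooks run exactly as before; a quick sketch using pre-commit's standard commands:

```sh
# Install the git hook, then run every configured hook across the repo once.
pre-commit install
pre-commit run --all-files
```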
18 changes: 7 additions & 11 deletions Pipfile
@@ -4,26 +4,22 @@ verify_ssl = true
name = "pypi"

[packages]
-"ga4gh.vrs" = {version = ">=0.7.5.dev1", extras = ["extras"]}
-civicpy = "*"
+"ga4gh.vrs" = "==0.8.0dev0"
+civicpy = ">=2.0.0"
requests = "*"
jsondiff = "*"
pydantic = "*"
requests-cache = "*"
-gene-normalizer = ">=0.1.25"
-disease-normalizer = ">=0.2.12"
-thera-py = ">=0.3.4"
+gene-normalizer = {version = "==0.1.30", extras = ["dev"]}
+disease-normalizer = {version = "==0.2.15", extras = ["dev"]}
+thera-py = {version = "==0.3.7", extras = ["dev"]}
neo4j = "*"
uvicorn = "*"
fastapi = "*"
-uvloop = "*"
-websockets = "*"
-httptools = "*"
-typing-extensions = "*"
boto3 = "*"
botocore = "*"
-variation-normalizer = ">= 0.4.0a7"
-"ga4gh.vrsatile.pydantic" = ">=0.0.11"
+variation-normalizer = "==0.5.1"
+"ga4gh.vrsatile.pydantic" = "==0.0.11"
asyncclick = "*"

[dev-packages]
57 changes: 48 additions & 9 deletions README.md
@@ -30,15 +30,13 @@ Once Pipenv is installed, clone the repo and install the package requirements in
```sh
git clone https://github.com/cancervariants/metakb
cd metakb
-pipenv lock
-pipenv sync
+pipenv lock && pipenv sync
```

If you intend to provide development support, install the development dependencies:

```sh
-pipenv lock --dev
-pipenv sync
+pipenv lock --dev && pipenv sync
```

### Setting up Neo4j
@@ -49,14 +47,14 @@ First, follow the [desktop setup instructions](https://neo4j.com/developer/neo4j

Once you have opened Neo4j desktop, use the "New" button in the upper-left region of the window to create a new project. Within that project, click the "Add" button in the upper-right region of the window and select "Local DBMS". The name of the DBMS doesn't matter, but the password will be used later to connect the database to MetaKB (we have been using "admin" by default). Click "Create". Then, click the row within the project screen corresponding to your newly-created DBMS, and click the green "Start" button to start the database service.

-The graph will initially be empty, but once you have successfully loaded data, Neo4j Desktop provides an interface for exploring and visualizing relationships within the graph. To access it, click the blue "Open" button. The prompt at the top of this window processes [Cypher queries](https://neo4j.com/docs/cypher-refcard/current/); to start, try `MATCH (n:Statement {id:"civic.eid:5818"}) RETURN n`. Buttons on the left-hand edge of the results pane let you select graph, tabular, or textual output.
+The graph will initially be empty, but once you have successfully loaded data, Neo4j Desktop provides an interface for exploring and visualizing relationships within the graph. To access it, click the blue "Open" button. The prompt at the top of this window processes [Cypher queries](https://neo4j.com/docs/cypher-refcard/current/); to start, try `MATCH (n:Statement {id:"civic.eid:1409"}) RETURN n`. Buttons on the left-hand edge of the results pane let you select graph, tabular, or textual output.


### Setting up normalizers

The MetaKB calls a number of normalizer libraries to transform resource data and resolve incoming search queries. These will be installed as part of the package requirements, but require additional setup.

-First, [download and install Amazon's DynamoDB](https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/DynamoDBLocal.DownloadingAndRunning.html). Once installed, in a separate terminal instance, navigate to its source directory and run the following to start the database instance:
+First, [follow these instructions for deploying DynamoDB locally on your computer](https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/DynamoDBLocal.DownloadingAndRunning.html). Once set up, in a separate terminal instance, navigate to its source directory and run the following to start the database instance:

```sh
java -Djava.library.path=./DynamoDBLocal_lib -jar DynamoDBLocal.jar -sharedDb
```

@@ -65,10 +63,10 @@
Next, navigate to the `site-packages` directory of your virtual environment. Assuming Pipenv is installed to your user directory, this should be something like:

```sh
-cd ~/.local/share/virtualenvs/metakb-<various characters>/python3.7/site-packages/ # replace <various characters>
+cd ~/.local/share/virtualenvs/metakb-<various characters>/lib/python<python-version>/site-packages/ # replace <various characters> and <python-version>
```

-Next, initialize the [Variation Normalizer](https://github.com/cancervariants/variation-normalization) by following the instructions in the [README](https://github.com/cancervariants/variation-normalization#installation).
+Next, initialize the [Variation Normalizer](https://github.com/cancervariants/variation-normalization) by following the instructions in its [README](https://github.com/cancervariants/variation-normalization#installation). When setting up the UTA database, [these docs](https://github.com/ga4gh/vrs-python/tree/main/docs/setup_help) may be helpful.


The MetaKB can acquire all other needed normalizer data, except for that of [OMIM](https://www.omim.org/downloads), which must be manually placed:
@@ -79,9 +77,46 @@ mkdir -p data/omim
cp ~/YOUR/PATH/TO/mimTitles.txt data/omim/omim_<date>.tsv # replace <date> with date of data acquisition formatted as YYYYMMDD
```

+### Environment Variables
+
+MetaKB relies on environment variables that must be set in order for it to work.
+
+* Always Required:
+  * `UTA_DB_URL`
+    * Used in Variation Normalizer, which relies on UTA Tools
+    * Format: `driver://user:pass@host/database/schema`
+    * More info can be found [here](https://github.com/GenomicMedLab/uta-tools#connecting-to-the-database)
+
+    Example:
+
+    ```shell script
+    export UTA_DB_URL=postgresql://uta_admin:password@localhost:5432/uta/uta_20210129
+    ```
+
+* Required when using the `--load_normalizers_db` or `--force_load_normalizers_db` arguments in CLI commands
+  * `RXNORM_API_KEY`
+    * Used in Therapy Normalizer to retrieve RxNorm data
+    * RxNorm requires a UMLS license, which you can register for [here](https://www.nlm.nih.gov/research/umls/index.html). You must set the `RXNORM_API_KEY` environment variable to your API key, which can be found in the [UTS 'My Profile' area](https://uts.nlm.nih.gov/uts/profile) after signing in.
+
+    Example:
+
+    ```shell script
+    export RXNORM_API_KEY={rxnorm_api_key}
+    ```
+
+  * `DATAVERSE_API_KEY`
+    * Used in Therapy Normalizer to retrieve HemOnc data
+    * HemOnc.org data requires a Harvard Dataverse API key. After creating a user account on the [Harvard Dataverse website](https://dataverse.harvard.edu/), you can follow [these instructions](https://guides.dataverse.org/en/latest/user/account.html) to generate a key. You must set the `DATAVERSE_API_KEY` environment variable to your API key.
+
+    Example:
+
+    ```shell script
+    export DATAVERSE_API_KEY={dataverse_api_key}
+    ```

### Loading data

-Once Neo4j and DynamoDB instances are both active, and necessary normalizer data has been placed, run the MetaKB CLI with the `--initialize_normalizers` flag to acquire all other necessary normalizer source data, and execute harvest, transform, and load operations into the graph datastore.
+Once Neo4j and DynamoDB instances are both running, and necessary normalizer data has been placed, run the MetaKB CLI with the `--load_normalizers_db` flag to acquire all other necessary normalizer source data, and execute harvest, transform, and load operations into the graph datastore.

In the MetaKB project root, run the following:

@@ -90,6 +125,8 @@ pipenv shell
python3 -m metakb.cli --db_url=bolt://localhost:7687 --db_username=neo4j --db_password=<neo4j-password-here> --load_normalizers_db
```

+
+For more information on the different CLI arguments, see the [CLI README](docs/cli/README.md).

### Starting the server

Once data has been loaded successfully, use the following to start service on localhost port 8000:
@@ -98,6 +135,8 @@ Once data has been loaded successfully, use the following to start service on localhost port 8000:

```sh
uvicorn metakb.main:app --reload
```

+
+Ensure that both the MetaKB Neo4j database and the normalizer databases are running.

Navigate to [http://localhost:8000/api/v2](http://localhost:8000/api/v2) in your browser to enter queries.

## Running tests
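As a quick smoke test of the running service, a request against the search endpoint might look like the sketch below. The route and parameter names here are assumptions, so check the interactive docs at `http://localhost:8000/api/v2` for the actual contract:

```sh
# Hypothetical smoke test -- the /search route and its query parameters
# are assumptions; verify them in the interactive docs at /api/v2.
curl "http://localhost:8000/api/v2/search?variation=BRAF%20V600E"
```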
30 changes: 30 additions & 0 deletions docs/cli/README.md
@@ -0,0 +1,30 @@
+# MetaKB CLI
+
+More information on the MetaKB CLI arguments.
+
+* `--db_url`
+  * URL endpoint for the application Neo4j database. Can also be provided via the environment variable `METAKB_DB_URL`.
+
+* `--db_username`
+  * Username to provide to the application Neo4j database. Can also be provided via the environment variable `METAKB_DB_USERNAME`.
+
+* `--db_password`
+  * Password to provide to the application Neo4j database. Can also be provided via the environment variable `METAKB_DB_PASSWORD`.
+
+* `--load_normalizers_db`
+  * Check the normalizers' (therapy, disease, and gene) DynamoDB database and load source data only if it is not already present.
+
+* `--force_load_normalizers_db`
+  * Load all normalizers' (therapy, disease, and gene) data into the DynamoDB database. Overrides `--load_normalizers_db` if both are selected.
+
+* `--normalizers_db_url`
+  * URL endpoint of the normalizers' (therapy, disease, and gene) DynamoDB database. Set to `http://localhost:8000` by default.
+
+* `--load_latest_cdms`
+  * Deletes all nodes from the MetaKB Neo4j database and loads it with the latest transformed CDM files stored locally in the `metakb/data` directory. This bypasses having to run the source harvest and transform steps. Exclusive with `--load_target_cdm` and `--load_latest_s3_cdms`.
+
+* `--load_target_cdm`
+  * Loads a single source's transformed CDM file at the specified path. This bypasses having to run the source harvest and transform steps. Exclusive with `--load_latest_cdms` and `--load_latest_s3_cdms`.
+
+* `--load_latest_s3_cdms`
+  * Deletes all nodes from the MetaKB Neo4j database, retrieves the latest transformed CDM files from the public S3 bucket, and loads the Neo4j database with the retrieved data. This bypasses having to run the source harvest and transform steps. Exclusive with `--load_latest_cdms` and `--load_target_cdm`.
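
Taken together, a typical invocation might look like the following sketch (it assumes the `python3 -m metakb.cli` entry point shown in the project README; the password is a placeholder):

```sh
# Sketch: load normalizer data only if missing, then rebuild the graph
# from the latest locally stored CDM files (skips harvest and transform).
python3 -m metakb.cli \
    --db_url=bolt://localhost:7687 \
    --db_username=neo4j \
    --db_password=<neo4j-password-here> \
    --normalizers_db_url=http://localhost:8000 \
    --load_normalizers_db \
    --load_latest_cdms
```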
4 changes: 0 additions & 4 deletions metakb/__init__.py
@@ -7,10 +7,6 @@
PROJECT_ROOT = Path(__file__).resolve().parents[1]

if 'METAKB_NORM_EB_PROD' in environ:
-    environ['VARIATION_NORM_EB_PROD'] = "true"
-    environ['GENE_NORM_EB_PROD'] = "true"
-    environ['THERAPY_NORM_EB_PROD'] = "true"
-    environ['DISEASE_NORM_EB_PROD'] = "true"
    LOG_FN = "/tmp/metakb.log"
else:
    LOG_FN = "metakb.log"
23 changes: 19 additions & 4 deletions metakb/cli.py
@@ -114,11 +114,21 @@ class CLI:
          "from VICC S3 bucket, and load the database with retrieved "
          "data. Exclusive with --load_latest_cdms and load_target_cdm.")
)
+@click.option(
+    "--update_cached",
+    "-u",
+    is_flag=True,
+    default=False,
+    required=False,
+    help=("`True` if civicpy cache should be updated. Note this will take several "
+          "minutes. `False` if local cache should be used.")
+)
async def update_metakb_db(
    db_url: str, db_username: str, db_password: str,
    load_normalizers_db: bool, force_load_normalizers_db: bool,
    normalizers_db_url: str, load_latest_cdms: bool,
-    load_target_cdm: Optional[Path], load_latest_s3_cdms: bool
+    load_target_cdm: Optional[Path], load_latest_s3_cdms: bool,
+    update_cached: bool
):
    """Execute data harvest and transformation from resources and upload
    to graph datastore.
@@ -141,7 +151,7 @@
    if load_normalizers_db or force_load_normalizers_db:
        CLI()._load_normalizers_db(force_load_normalizers_db)

-    CLI()._harvest_sources()
+    CLI()._harvest_sources(update_cached)
    await CLI()._transform_sources()

    # Load neo4j database
@@ -225,7 +235,7 @@ def _retrieve_s3_cdms(self) -> str:
        return newest_version

    @staticmethod
-    def _harvest_sources() -> None:
+    def _harvest_sources(update_cached: bool) -> None:
        """Run harvesting procedure for all sources."""
        echo_info("Harvesting sources...")
        # TODO: Switch to using constant
@@ -238,7 +248,12 @@ def _harvest_sources() -> None:
echo_info(f"Harvesting {source_str}...")
start = timer()
source: Harvester = source_class()
source_successful = source.harvest()
if source_str == "civic" and update_cached:
# Use latest civic data
echo_info("(civicpy cache is also being updated)")
source_successful = source.harvest(update_cache=True)
else:
source_successful = source.harvest()
end = timer()
if not source_successful:
echo_info(f'{source_str} harvest failed.')
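
With the new flag wired through, a harvest that also refreshes the civicpy cache could be invoked as in the sketch below (the entry point and credentials follow the project README; the password is a placeholder):

```sh
# Sketch: -u/--update_cached forces civicpy to re-download its cache,
# which the help text warns can take several minutes.
python3 -m metakb.cli \
    --db_url=bolt://localhost:7687 \
    --db_username=neo4j \
    --db_password=<neo4j-password-here> \
    --update_cached
```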
2 changes: 1 addition & 1 deletion metakb/database.py
@@ -457,7 +457,7 @@ def _add_statement(tx, statement: Dict, added_ids: Set[str]):
    @staticmethod
    def get_secret():
        """Get secrets for MetaKB instances."""
-        secret_name = environ['METAKB_DB_PASSWORD']
+        secret_name = environ['METAKB_DB_SECRET']
        region_name = "us-east-2"

        # Create a Secrets Manager client
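
`get_secret` now reads the Secrets Manager secret *name* from `METAKB_DB_SECRET` instead of overloading `METAKB_DB_PASSWORD`. A sketch for sanity-checking the value outside the app, assuming the standard AWS CLI with configured credentials (the secret name shown is hypothetical):

```sh
# METAKB_DB_SECRET holds the secret's name, not the password itself.
export METAKB_DB_SECRET=metakb/neo4j-credentials  # hypothetical name
aws secretsmanager get-secret-value \
    --secret-id "$METAKB_DB_SECRET" \
    --region us-east-2  # region hard-coded in database.py above
```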
18 changes: 4 additions & 14 deletions metakb/harvesters/base.py
@@ -6,18 +6,18 @@

from metakb import APP_ROOT, DATE_FMT

-logger = logging.getLogger('metakb')
+logger = logging.getLogger("metakb.harvesters.base")
logger.setLevel(logging.DEBUG)


class Harvester:
    """A base class for content harvesters."""

-    def __init__(self):
+    def __init__(self) -> None:
        """Initialize Harvester class."""
        self.assertions = []

-    def harvest(self):
+    def harvest(self) -> bool:
        """
        Retrieve and store records from a resource. Records may be stored in
        any manner, but must be retrievable by :method:`iterate_records`.
@@ -27,16 +27,6 @@ def harvest(self):
"""
raise NotImplementedError

def iter_assertions(self):
"""
Yield all :class:`ClinSigAssertion` records for the resource.
:return: An iterator
:rtype: Iterator[:class:`ClinSigAssertion`]
"""
for statement in self.assertions:
yield statement

def create_json(self, items: Dict[str, List],
filename: Optional[str] = None) -> bool:
"""Create composite and individual JSON for harvested data.
Expand All @@ -59,7 +49,7 @@ def create_json(self, items: Dict[str, List],
            if filename is None:
                filename = f"{src}_harvester_{today}.json"
            with open(src_dir / filename, "w+") as f:
-                json.dump(composite_dict, f, indent=4)
+                f.write(json.dumps(composite_dict, indent=4))
        except Exception as e:
            logger.error(f"Unable to create json: {e}")
            return False