Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Release 1.2.7 #290

Merged
merged 32 commits into from
Jul 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
ef3c2eb
Updated UMM-C schema file
Apr 3, 2024
6861319
Refactor URL extraction logic in url_validator.py
rajeshpandey2053 Apr 5, 2024
14b8c64
wrote validation code for datetime format check
binni979 Apr 8, 2024
f0884da
modified get_precision function
binni979 Apr 10, 2024
4b3748b
modified get_precision file
binni979 Apr 10, 2024
67c135d
removed format as return value
binni979 Apr 12, 2024
b3ec308
clean the code
binni979 Apr 12, 2024
97dd57a
umm-c schema updated to 1.18.0
smk0033 Apr 17, 2024
7bfce69
update urlextract version and provide cache directory info in constru…
rajeshpandey2053 Apr 18, 2024
eb50fef
moved formats into constant file and get_time_function in utils file
binni979 Apr 24, 2024
20741e8
Refactor _run_function method in checker.py for better code organizat…
rajeshpandey2053 Apr 24, 2024
97b3331
Added doc strings in a function
binni979 Apr 24, 2024
c27bd69
use ThreadPoolExecutor for parallel processing in checker
rajeshpandey2053 Apr 24, 2024
befddc5
Refactor CustomChecker class to use multithreading for argument proce…
rajeshpandey2053 Apr 24, 2024
246eb9a
Merge pull request #278 from NASA-IMPACT/fix_check_url
rajeshpandey2053 Apr 24, 2024
05191f1
worked on the standard product check error
binni979 Apr 29, 2024
5d73536
Added a docstrings in standard_product_check function
binni979 Apr 29, 2024
eb421a4
fix the appropriate handling of thread outputs in customer checker
rajeshpandey2053 Apr 30, 2024
b101caf
add error handling and code organization in multi threaded code
rajeshpandey2053 Apr 30, 2024
dfd681c
Add max_worker.
xhagrg May 3, 2024
42d575b
Modified standard product check function
binni979 May 6, 2024
244411c
Merge pull request #279 from NASA-IMPACT/validate-datetime-against_gr…
xhagrg May 6, 2024
292bae7
Merge pull request #282 from NASA-IMPACT/smk_schema_update
xhagrg May 6, 2024
a0fbe59
Merge pull request #284 from NASA-IMPACT/multithreading
xhagrg May 6, 2024
eab6158
make some changes on _error_message function to solve the problem
binni979 May 7, 2024
f2287ea
worked on the one_time_presence_check function
binni979 May 9, 2024
86d8838
worked on the comment
binni979 May 9, 2024
ec915ef
Merge pull request #285 from NASA-IMPACT/standard-product-check
slesaad May 13, 2024
7c3683d
Merge pull request #288 from NASA-IMPACT/doi-link-update-check
slesaad May 13, 2024
7fa67eb
Use env variable for cache dir.
xhagrg May 28, 2024
2771414
Merge pull request #289 from NASA-IMPACT/fix-cache_dir
xhagrg May 28, 2024
122f4aa
Bump version
slesaad Jun 24, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 66 additions & 27 deletions pyQuARC/code/checker.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import json

from xmltodict import parse
from concurrent.futures import ThreadPoolExecutor, as_completed

from .custom_checker import CustomChecker
from .schema_validator import SchemaValidator
Expand Down Expand Up @@ -154,43 +155,81 @@ def _check_dependencies_validity(self, dependencies, field_dict):
return False
return True

def _process_field(
    self,
    func,
    check,
    rule_id,
    metadata_content,
    field_dict,
    result_dict,
    rule_mapping,
):
    """
    Apply one rule to one field entry and record the outcome in `result_dict`.

    Args:
        func: The check function handed to `self.custom_checker.run`.
        check: The check definition, forwarded to the scheduler when the
            dependencies of this rule are collected.
        rule_id (str): Identifier of the rule being applied.
        metadata_content: The metadata passed on to the custom checker.
        field_dict (dict): One entry of the rule's fields to apply. The first
            item of its "fields" list is used as the main field; an optional
            "data" key takes precedence over the rule-level "data".
        result_dict (dict): Results keyed by main field and then by rule id.
            It is mutated in place.
        rule_mapping (dict): Configuration of the rule ("data", "relation", ...).

    Returns:
        None. The outcome is written into `result_dict` and reported to
        `self.tracker`.
    """
    relation = rule_mapping.get("relation")
    rule_level_data = rule_mapping.get("data", [])
    dependencies = self.scheduler.get_all_dependencies(
        rule_mapping, check, field_dict
    )
    primary_field = field_dict["fields"][0]
    # A field-level "data" entry overrides the rule-level one
    external_data = field_dict.get("data", rule_level_data)
    # The field gets an (empty) entry even when the rule ends up not applied
    result_dict.setdefault(primary_field, {})

    if self._check_dependencies_validity(dependencies, field_dict):
        result = self.custom_checker.run(
            func, metadata_content, field_dict, external_data, relation
        )
        outcome = result["valid"]
        self.tracker.update_data(rule_id, primary_field, outcome)

        # A None outcome means the rule was not applied; keep it out of the results
        if outcome is not None:
            result_dict[primary_field][rule_id] = result

            message = self.build_message(result, rule_id)
            if message:
                result["message"] = message
                result["remediation"] = self.message(rule_id, "remediation")

def _run_func(self, func, check, rule_id, metadata_content, result_dict):
    """
    Run the check function for `rule_id` and update `result_dict`

    Every entry of the rule's "fields_to_apply" for the current metadata
    format is handed to `self._process_field` exactly once, on a small
    thread pool.

    Args:
        func: The check function to run.
        check: The check definition, forwarded to `_process_field`.
        rule_id (str): Identifier of the rule; looked up in
            `self.rules_override` first, then in `self.rule_mapping`.
        metadata_content: The metadata to validate.
        result_dict (dict): Results container, filled in by `_process_field`.

    Raises:
        Exception: Whatever a `_process_field` call raised is re-raised here.
    """
    rule_mapping = self.rules_override.get(rule_id) or self.rule_mapping.get(
        rule_id
    )
    list_of_fields_to_apply = rule_mapping.get("fields_to_apply").get(
        self.metadata_format, {}
    )
    if not list_of_fields_to_apply:
        # Nothing to check for this metadata format; don't spin up a pool
        return

    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = [
            executor.submit(
                self._process_field,
                func,
                check,
                rule_id,
                metadata_content,
                field_dict,
                result_dict,
                rule_mapping,
            )
            for field_dict in list_of_fields_to_apply
        ]

        # `result()` re-raises an exception from the worker thread; leaving
        # the `with` block still waits for the tasks that are in flight.
        for future in as_completed(futures):
            future.result()

def perform_custom_checks(self, metadata_content):
"""
Expand Down
10 changes: 10 additions & 0 deletions pyQuARC/code/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,3 +77,13 @@
}

CMR_URL = "https://cmr.earthdata.nasa.gov"

# `datetime.strptime` patterns for ISO-8601-style timestamps, listed from the
# most precise to the least precise. None of them contains a timezone
# directive, so a value ending in "Z" or a UTC offset matches no entry.
DATE_FORMATS = [
    "%Y-%m-%dT%H:%M:%S.%f",  # date and time with fractional seconds
    "%Y-%m-%dT%H:%M:%S",  # date and time down to the second
    "%Y-%m-%dT%H:%M",  # date and time down to the minute
    "%Y-%m-%dT%H",  # date and hour
    "%Y-%m-%d",  # full calendar date
    "%Y-%m",  # year and month
    "%Y",  # year only
]
75 changes: 57 additions & 18 deletions pyQuARC/code/custom_checker.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from urllib.parse import urlparse
from concurrent.futures import ThreadPoolExecutor, as_completed


class CustomChecker:
Expand Down Expand Up @@ -103,6 +104,33 @@ def _get_path_value(content_to_validate, path_string):
)
return container

@staticmethod
def _process_argument(arg, func, relation, external_data, external_relation):
    """
    Call `func` with one set of field values plus the applicable extra arguments.

    Args:
        arg (iterable): The positional values for this invocation of `func`.
        func (callable): The function to be called.
        relation: The relation argument; appended only if truthy.
        external_data (iterable): External data items; each truthy item is
            appended as its own positional argument.
        external_relation: The external relation argument; appended only if
            truthy.

    Returns:
        The return value of `func`, unchanged.
    """
    call_args = list(arg)
    # Falsy extras (None, "", 0, empty containers) are dropped, not passed on
    call_args += [
        extra_arg
        for extra_arg in (relation, *external_data, external_relation)
        if extra_arg
    ]
    return func(*call_args)

def run(
self, func, content_to_validate, field_dict, external_data, external_relation
):
Expand Down Expand Up @@ -137,24 +165,35 @@ def run(

invalid_values = []
validity = None
for arg in args:
function_args = [*arg]
function_args.extend(
[
extra_arg
for extra_arg in [relation, *external_data, external_relation]
if extra_arg
]
)
func_return = func(*function_args)
valid = func_return["valid"] # can be True, False or None
if valid is not None:
if valid:
validity = validity or (validity is None)
else:
if "value" in func_return:
invalid_values.append(func_return["value"])
validity = False

# Process arguments using multithreading
with ThreadPoolExecutor() as executor:
future_results = []
for arg in args:
future = executor.submit(
self._process_argument,
arg,
func,
relation,
external_data,
external_relation,
)
future_results.append(future)

# Retrieve results from futures
for future in as_completed(future_results):
try:
func_return = future.result()
valid = func_return["valid"] # can be True, False or None
if valid is not None:
if valid:
validity = validity or (validity is None)
else:
if "value" in func_return:
invalid_values.append(func_return["value"])
validity = False
except Exception as e:
raise e
result["valid"] = validity
result["value"] = invalid_values
return result
6 changes: 3 additions & 3 deletions pyQuARC/code/custom_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,13 +92,13 @@ def one_item_presence_check(*field_values):
value = None

for field_value in field_values:
if field_value:
if field_value is not None:
value = field_value
validity = True
break

return {"valid": validity, "value": value}

@staticmethod
def dif_standard_product_check(*field_values):
"""
Expand Down Expand Up @@ -130,7 +130,7 @@ def license_url_description_check(description_field, url_field, license_text):
description_field (string): string describing the URL
"""
validity = True
value = description_field
value = description_field

if not license_text and not url_field:
validity = False
Expand Down
14 changes: 9 additions & 5 deletions pyQuARC/code/datetime_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from datetime import datetime

from .base_validator import BaseValidator
from .utils import cmr_request, if_arg, set_cmr_prms
from .utils import cmr_request, if_arg, set_cmr_prms, get_date_time


class DatetimeValidator(BaseValidator):
Expand Down Expand Up @@ -117,13 +117,13 @@ def compare(first, second, relation):

@staticmethod
def validate_datetime_against_granules(
datetime, collection_shortname, version, sort_key, time_key
datetime_string, collection_shortname, version, sort_key, time_key
):
"""
Validates the collection datetime against the datetime of the last granule in the collection

Args:
datetime (str): datetime string
datetime_string (str): datetime string
collection_shortname (str): ShortName of the parent collection
sort_key (str): choice of start_date and end_date
time_key (str): choice of time_end and time_start
Expand All @@ -143,13 +143,17 @@ def validate_datetime_against_granules(

validity = True
last_granule_datetime = None
date_time = None

# Compare the precision of the two datetime strings
if len(granules["feed"]["entry"]) > 0:
last_granule = granules["feed"]["entry"][0]
last_granule_datetime = last_granule.get(time_key)
validity = datetime == last_granule_datetime
date_time = get_date_time(datetime_string)
last_granule_datetime = get_date_time(last_granule_datetime)
validity = date_time == last_granule_datetime

return {"valid": validity, "value": (datetime, last_granule_datetime)}
return {"valid": validity, "value": (date_time, last_granule_datetime)}

@staticmethod
@if_arg
Expand Down
5 changes: 3 additions & 2 deletions pyQuARC/code/url_validator.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
import requests

from urlextract import URLExtract
Expand Down Expand Up @@ -54,7 +55,7 @@ def status_code_from_request(url):
validity = True

# extract URLs from text
extractor = URLExtract()
extractor = URLExtract(cache_dir=os.environ.get("CACHE_DIR"))
urls = extractor.find_urls(text_with_urls)
urls.extend(UrlValidator._extract_http_texts(text_with_urls))

Expand Down Expand Up @@ -115,4 +116,4 @@ def doi_link_update(value, bad_urls):
if value in bad_urls:
validity = False

return {"valid": validity, "Value": value}
return {"valid": validity, "value": value}
20 changes: 19 additions & 1 deletion pyQuARC/code/utils.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import os
import requests
import urllib
from datetime import datetime

from functools import wraps

from .constants import CMR_URL
from .constants import CMR_URL, DATE_FORMATS


def if_arg(func):
Expand Down Expand Up @@ -64,3 +65,20 @@ def cmr_request(cmr_prms):

def collection_in_cmr(cmr_prms):
return cmr_request(cmr_prms).get("hits", 0) > 0


def get_date_time(dt_str, formats=None):
    """
    Convert a date and time string to a datetime object using predefined formats.

    The formats are tried one after another and the result of the first
    successful parse is returned.

    Args:
        dt_str (str): The date/time string to parse.
        formats (iterable of str, optional): `datetime.strptime` formats to
            try, in order. Defaults to `DATE_FORMATS`.

    Returns:
        datetime or None: The parsed value, or `None` if `dt_str` is not a
        string or matches none of the formats.
    """
    # strptime raises TypeError (not ValueError) for non-string input such as
    # a missing value (None); report that as "could not parse" as well.
    if not isinstance(dt_str, str):
        return None
    if formats is None:
        formats = DATE_FORMATS
    for fmt in formats:
        try:
            return datetime.strptime(dt_str, fmt)
        except ValueError:
            continue
    return None
Loading
Loading