Commit b9c8321
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
pre-commit-ci[bot] committed Oct 16, 2023
1 parent 4bd4633 commit b9c8321
Showing 7 changed files with 0 additions and 18 deletions.
1 change: 0 additions & 1 deletion src/applications/danews/dedupe.py
@@ -54,7 +54,6 @@ def __extract_is_duplicate(mask):
def main(
path,
) -> None:
-
deduper = Deduper()

msg.info("Loading Dataset")
1 change: 0 additions & 1 deletion
@@ -254,7 +254,6 @@ def dns_filter(


if __name__ == "__main__":
-
path = os.path.join("/work/netarkivet-cleaned/safe_search_domains.json")

save_path = os.path.join("/work/netarkivet-cleaned/safe_search_domains_safe.pkl")
3 changes: 0 additions & 3 deletions src/dfm/cleaning/clean_cli.py
@@ -120,7 +120,6 @@ def apply_quality_filter(batch: dict, cfg: DictConfig) -> dict:
qf = create_quality_filter(cfg)

if cfg.save_meta_data:
-
valid_langs = set(cfg.valid_languages)
if valid_langs:

@@ -189,7 +188,6 @@ def apply_sentence_filter(batch: dict, cfg: DictConfig) -> dict:
sf = create_sentence_filter(cfg)

if cfg.save_meta_data:
-
valid_langs = set(cfg.valid_languages)

if valid_langs:
@@ -284,7 +282,6 @@ def process_files(path: Path, cfg: DictConfig) -> None:
dataset.filter(lambda example: example[cfg.lang_col] in valid_langs)

if cfg.apply_sentence_filter:
-
dataset = dataset.map(
lambda batch: apply_sentence_filter(batch, cfg),
batched=True,
6 changes: 0 additions & 6 deletions src/dfm/cleaning/deduper.py
@@ -410,7 +410,6 @@ def _deduplicate(  # noqa: C901
# Otherwise we check if the corpus is an iterable of dictionaries, in
# which case we also convert it to an iterable of tuples
else:
-
# extract the first element of the corpus
corpus = iter(corpus)
sample = next(corpus)
@@ -470,10 +469,8 @@ def _deduplicate(  # noqa: C901
leave=False,
)
with tqdm(batches, **pbar_params) as pbar:
-
# Initialise the multiprocessing
with Parallel(n_jobs=self.n_jobs) as parallel:
-
# Define the function that will be called in parallel
fn = delayed(
partial(
@@ -489,7 +486,6 @@ def _deduplicate(  # noqa: C901

# Iterate over the batches
for batch in pbar:
-
# Create a copy of the batch to ensure that we're not
# modifying the original
batch, batch_copy = it.tee(batch)
@@ -514,13 +510,11 @@ def _deduplicate(  # noqa: C901
pbar_params["desc"] = "Deduplicating batch"
with tqdm(batch_copy, **pbar_params) as batch_pbar:
for (idx, doc), minhash in zip(batch_pbar, minhashes):
-
# If the document is not a near-duplicate candidate
# then store in the LSH cache and append it to the
# JSONL output file
candidates = self.lsh_cache.query(minhash)
if len(candidates) == 0:
-
# Insert the document into the LSH cache
self.lsh_cache.insert(idx, minhash)

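The deduper.py hunks above revolve around one core check: query the LSH cache with a document's MinHash and only insert and keep the document if no near-duplicate candidates come back. Below is a minimal, self-contained sketch of that pattern; it is not the project's Deduper API, and the datasketch library, the word-level shingling, and the 0.8 similarity threshold are illustrative assumptions.

# Minimal sketch of the MinHash + LSH-cache deduplication pattern seen in the
# deduper.py hunks above. Not the project's Deduper API; datasketch, the
# word-level shingling and the 0.8 threshold are assumptions for illustration.
from datasketch import MinHash, MinHashLSH


def minhash_signature(doc: str, num_perm: int = 128) -> MinHash:
    """Build a MinHash signature from the document's whitespace-split tokens."""
    signature = MinHash(num_perm=num_perm)
    for token in doc.split():
        signature.update(token.encode("utf-8"))
    return signature


def deduplicate(corpus):
    """Yield (idx, doc) pairs that have no near-duplicate already in the cache."""
    lsh_cache = MinHashLSH(threshold=0.8, num_perm=128)
    for idx, doc in corpus:
        signature = minhash_signature(doc)
        # Query the cache for near-duplicate candidates
        candidates = lsh_cache.query(signature)
        if len(candidates) == 0:
            # No candidates: index the document and keep it
            lsh_cache.insert(idx, signature)
            yield idx, doc


if __name__ == "__main__":
    corpus = list(enumerate(["hej med dig min ven", "hej med dig min ven", "farvel du gamle"]))
    print(list(deduplicate(corpus)))  # the exact duplicate is dropped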
2 changes: 0 additions & 2 deletions src/dfm/cleaning/quality_filter.py
@@ -272,7 +272,6 @@ def __init__(
short_long_sentence_length_split: int = 30,
short_long_sentence_threshold: float = 0.5,
):
-
__available_language_detection_tools = ["langdetect", "luga"]

if language_detection_tool not in __available_language_detection_tools:
@@ -802,7 +801,6 @@ def duplicate_ngram_fraction_filter(

for i, _ in enumerate(doc):
for ngram_size in range(lower, upper + 1):
-
min_, max_ = minmax[ngram_size]
end = i + ngram_size

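The second quality_filter.py hunk sits inside duplicate_ngram_fraction_filter, which scores how much of a document is covered by repeated n-grams. The project's filter tracks character spans per n-gram size; the following is only a simplified sketch of the underlying idea, where the function name, word-level n-grams, and example values are assumptions.

# Simplified sketch of the duplicate n-gram fraction idea behind
# duplicate_ngram_fraction_filter. Not the project's implementation: it works
# on word-level n-grams and counts duplicated characters for one n-gram size.
from collections import Counter


def duplicate_ngram_char_fraction(doc: str, ngram_size: int) -> float:
    """Fraction of characters belonging to n-grams that occur more than once."""
    words = doc.split()
    ngrams = [" ".join(words[i : i + ngram_size]) for i in range(len(words) - ngram_size + 1)]
    if not ngrams:
        return 0.0
    counts = Counter(ngrams)
    total_chars = sum(len(gram) * count for gram, count in counts.items())
    duplicated_chars = sum(len(gram) * count for gram, count in counts.items() if count > 1)
    return duplicated_chars / total_chars


if __name__ == "__main__":
    doc = "hej med dig hej med dig farvel du gamle"
    print(round(duplicate_ngram_char_fraction(doc, ngram_size=3), 2))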
3 changes: 0 additions & 3 deletions src/dfm/cleaning/sentence_filter.py
@@ -72,7 +72,6 @@ def __init__(
curly_brackets_threshold: int = 2,
n_jobs: int = -1,
):
-
# Store arguments as attributes
self.title_cased_words_threshold = title_cased_words_threshold
self.min_num_words = min_num_words
@@ -186,7 +185,6 @@ def filter_sample(
yield filter_sample(doc)
else:
with Parallel(n_jobs=n_jobs, backend="threading") as parallel:
-
# Set up iterator, depending on whether we have a progress bar or not
if progress_bar:
itr = tqdm(docs, desc="Filtering corpus", total=total)
@@ -233,7 +231,6 @@ def apply_filters(self, doc: str) -> Union[str, None]:
"""
# Iterate over all the filter functions
for filter_name, filter_fn in self.filters.items():
-
# Apply the filter function, which returns True if the document satisfied
# the filter, and False if it didn't
satisfied_filter = filter_fn(doc)
2 changes: 0 additions & 2 deletions tests/cleaning/deduper_test.py
@@ -60,7 +60,6 @@ def deduper(self, **kwargs):
return Deduper(**dict(default_test_args, **kwargs))

def dedup(self, corpus, **kwargs):
-
# Add a document ID to the corpus, if it isn't there already
if isinstance(corpus, list) and isinstance(corpus[0], str):
corpus = list(enumerate(corpus))
@@ -212,7 +211,6 @@ def test_load_from_disk(self, minhash_params):
corpus = ["hej med dig min ven", "hej med dig min ven", "farvel du gamle"]
corpus = list(enumerate(corpus))
with tempfile.TemporaryDirectory() as temp:
-
# Create a deduper loaded from disk, and a different new one
deduper = self.deduper(split_method="paragraph")
deduper.deduplicate(corpus, output_dir=temp, overwrite=True)
