diff --git a/src/applications/danews/dedupe.py b/src/applications/danews/dedupe.py
index d07d5a89..24095325 100644
--- a/src/applications/danews/dedupe.py
+++ b/src/applications/danews/dedupe.py
@@ -54,7 +54,6 @@ def __extract_is_duplicate(mask):
 def main(
     path,
 ) -> None:
-
     deduper = Deduper()

     msg.info("Loading Dataset")
diff --git a/src/applications/netarkivet/cleaning-v1/content_filtering/DNS_filter.py b/src/applications/netarkivet/cleaning-v1/content_filtering/DNS_filter.py
index 2ef87a91..8441e2f8 100644
--- a/src/applications/netarkivet/cleaning-v1/content_filtering/DNS_filter.py
+++ b/src/applications/netarkivet/cleaning-v1/content_filtering/DNS_filter.py
@@ -254,7 +254,6 @@ def dns_filter(


 if __name__ == "__main__":
-
     path = os.path.join("/work/netarkivet-cleaned/safe_search_domains.json")
     save_path = os.path.join("/work/netarkivet-cleaned/safe_search_domains_safe.pkl")

diff --git a/src/dfm/cleaning/clean_cli.py b/src/dfm/cleaning/clean_cli.py
index 581294c5..535dc29f 100644
--- a/src/dfm/cleaning/clean_cli.py
+++ b/src/dfm/cleaning/clean_cli.py
@@ -120,7 +120,6 @@ def apply_quality_filter(batch: dict, cfg: DictConfig) -> dict:
     qf = create_quality_filter(cfg)

     if cfg.save_meta_data:
-
         valid_langs = set(cfg.valid_languages)

         if valid_langs:
@@ -189,7 +188,6 @@ def apply_sentence_filter(batch: dict, cfg: DictConfig) -> dict:
     sf = create_sentence_filter(cfg)

     if cfg.save_meta_data:
-
         valid_langs = set(cfg.valid_languages)

         if valid_langs:
@@ -284,7 +282,6 @@ def process_files(path: Path, cfg: DictConfig) -> None:
         dataset.filter(lambda example: example[cfg.lang_col] in valid_langs)

     if cfg.apply_sentence_filter:
-
         dataset = dataset.map(
             lambda batch: apply_sentence_filter(batch, cfg),
             batched=True,
diff --git a/src/dfm/cleaning/deduper.py b/src/dfm/cleaning/deduper.py
index c80ed7aa..a69f427c 100644
--- a/src/dfm/cleaning/deduper.py
+++ b/src/dfm/cleaning/deduper.py
@@ -410,7 +410,6 @@ def _deduplicate(  # noqa: C901
         # Otherwise we check if the corpus is an iterable of dictionaries, in
         # which case we also convert it to an iterable of tuples
         else:
-
             # extract the first element of the corpus
             corpus = iter(corpus)
             sample = next(corpus)
@@ -470,10 +469,8 @@ def _deduplicate(  # noqa: C901
             leave=False,
         )
         with tqdm(batches, **pbar_params) as pbar:
-
             # Initialise the multiprocessing
             with Parallel(n_jobs=self.n_jobs) as parallel:
-
                 # Define the function that will be called in parallel
                 fn = delayed(
                     partial(
@@ -489,7 +486,6 @@ def _deduplicate(  # noqa: C901

                 # Iterate over the batches
                 for batch in pbar:
-
                     # Create a copy of the batch to ensure that we're not
                     # modifying the original batch,
                     batch_copy = it.tee(batch)
@@ -514,13 +510,11 @@ def _deduplicate(  # noqa: C901
                     pbar_params["desc"] = "Deduplicating batch"
                     with tqdm(batch_copy, **pbar_params) as batch_pbar:
                         for (idx, doc), minhash in zip(batch_pbar, minhashes):
-
                             # If the document is not a near-duplicate candidate
                             # then store in the LSH cache and append it to the
                             # JSONL output file
                             candidates = self.lsh_cache.query(minhash)
                             if len(candidates) == 0:
-
                                 # Insert the document into the LSH cache
                                 self.lsh_cache.insert(idx, minhash)

diff --git a/src/dfm/cleaning/quality_filter.py b/src/dfm/cleaning/quality_filter.py
index 91f10db6..f4884574 100644
--- a/src/dfm/cleaning/quality_filter.py
+++ b/src/dfm/cleaning/quality_filter.py
@@ -272,7 +272,6 @@ def __init__(
         short_long_sentence_length_split: int = 30,
         short_long_sentence_threshold: float = 0.5,
     ):
-
         __available_language_detection_tools = ["langdetect", "luga"]

         if language_detection_tool not in __available_language_detection_tools:
@@ -802,7 +801,6 @@ def duplicate_ngram_fraction_filter(

     for i, _ in enumerate(doc):
         for ngram_size in range(lower, upper + 1):
-
             min_, max_ = minmax[ngram_size]

             end = i + ngram_size
diff --git a/src/dfm/cleaning/sentence_filter.py b/src/dfm/cleaning/sentence_filter.py
index d3f0a2f0..51c3ba3e 100644
--- a/src/dfm/cleaning/sentence_filter.py
+++ b/src/dfm/cleaning/sentence_filter.py
@@ -72,7 +72,6 @@ def __init__(
         curly_brackets_threshold: int = 2,
         n_jobs: int = -1,
     ):
-
         # Store arguments as attributes
         self.title_cased_words_threshold = title_cased_words_threshold
         self.min_num_words = min_num_words
@@ -186,7 +185,6 @@ def filter_sample(
                 yield filter_sample(doc)
         else:
             with Parallel(n_jobs=n_jobs, backend="threading") as parallel:
-
                 # Set up iterator, depending on whether we have a progress bar or not
                 if progress_bar:
                     itr = tqdm(docs, desc="Filtering corpus", total=total)
@@ -233,7 +231,6 @@ def apply_filters(self, doc: str) -> Union[str, None]:
         """
         # Iterate over all the filter functions
         for filter_name, filter_fn in self.filters.items():
-
             # Apply the filter function, which returns True if the document satisfied
             # the filter, and False if it didn't
             satisfied_filter = filter_fn(doc)
diff --git a/tests/cleaning/deduper_test.py b/tests/cleaning/deduper_test.py
index 39ab5643..cf666ee2 100644
--- a/tests/cleaning/deduper_test.py
+++ b/tests/cleaning/deduper_test.py
@@ -60,7 +60,6 @@ def deduper(self, **kwargs):
         return Deduper(**dict(default_test_args, **kwargs))

     def dedup(self, corpus, **kwargs):
-
         # Add a document ID to the corpus, if it isn't there already
         if isinstance(corpus, list) and isinstance(corpus[0], str):
             corpus = list(enumerate(corpus))
@@ -212,7 +211,6 @@ def test_load_from_disk(self, minhash_params):
         corpus = ["hej med dig min ven", "hej med dig min ven", "farvel du gamle"]
         corpus = list(enumerate(corpus))
         with tempfile.TemporaryDirectory() as temp:
-
             # Create a deduper loaded from disk, and a different new one
             deduper = self.deduper(split_method="paragraph")
             deduper.deduplicate(corpus, output_dir=temp, overwrite=True)