Commit b9c8321
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
pre-commit-ci[bot] committed Oct 16, 2023
1 parent 4bd4633 commit b9c8321
Showing 7 changed files with 0 additions and 18 deletions.
1 change: 0 additions & 1 deletion src/applications/danews/dedupe.py
@@ -54,7 +54,6 @@ def __extract_is_duplicate(mask):
def main(
path,
) -> None:
-
deduper = Deduper()

msg.info("Loading Dataset")
1 change: 0 additions & 1 deletion
@@ -254,7 +254,6 @@ def dns_filter(


if __name__ == "__main__":
-
path = os.path.join("/work/netarkivet-cleaned/safe_search_domains.json")

save_path = os.path.join("/work/netarkivet-cleaned/safe_search_domains_safe.pkl")
3 changes: 0 additions & 3 deletions src/dfm/cleaning/clean_cli.py
@@ -120,7 +120,6 @@ def apply_quality_filter(batch: dict, cfg: DictConfig) -> dict:
qf = create_quality_filter(cfg)

if cfg.save_meta_data:
-
valid_langs = set(cfg.valid_languages)
if valid_langs:

@@ -189,7 +188,6 @@ def apply_sentence_filter(batch: dict, cfg: DictConfig) -> dict:
sf = create_sentence_filter(cfg)

if cfg.save_meta_data:
-
valid_langs = set(cfg.valid_languages)

if valid_langs:
@@ -284,7 +282,6 @@ def process_files(path: Path, cfg: DictConfig) -> None:
dataset.filter(lambda example: example[cfg.lang_col] in valid_langs)

if cfg.apply_sentence_filter:
-
dataset = dataset.map(
lambda batch: apply_sentence_filter(batch, cfg),
batched=True,
6 changes: 0 additions & 6 deletions src/dfm/cleaning/deduper.py
@@ -410,7 +410,6 @@ def _deduplicate(  # noqa: C901
# Otherwise we check if the corpus is an iterable of dictionaries, in
# which case we also convert it to an iterable of tuples
else:
-
# extract the first element of the corpus
corpus = iter(corpus)
sample = next(corpus)
@@ -470,10 +469,8 @@ def _deduplicate(  # noqa: C901
leave=False,
)
with tqdm(batches, **pbar_params) as pbar:
-
# Initialise the multiprocessing
with Parallel(n_jobs=self.n_jobs) as parallel:
-
# Define the function that will be called in parallel
fn = delayed(
partial(
@@ -489,7 +486,6 @@ def _deduplicate(  # noqa: C901

# Iterate over the batches
for batch in pbar:
-
# Create a copy of the batch to ensure that we're not
# modifying the original
batch, batch_copy = it.tee(batch)
@@ -514,13 +510,11 @@ def _deduplicate(  # noqa: C901
pbar_params["desc"] = "Deduplicating batch"
with tqdm(batch_copy, **pbar_params) as batch_pbar:
for (idx, doc), minhash in zip(batch_pbar, minhashes):
-
# If the document is not a near-duplicate candidate
# then store in the LSH cache and append it to the
# JSONL output file
candidates = self.lsh_cache.query(minhash)
if len(candidates) == 0:
-
# Insert the document into the LSH cache
self.lsh_cache.insert(idx, minhash)

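The deduper.py hunks above revolve around one core check: query the LSH cache with a document's MinHash and only insert and keep the document if no near-duplicate candidates come back. Below is a minimal, self-contained sketch of that pattern; it is not the project's Deduper API, and the datasketch library, the word-level shingling, and the 0.8 similarity threshold are illustrative assumptions.

# Minimal sketch of the MinHash + LSH-cache deduplication pattern seen in the
# deduper.py hunks above. Not the project's Deduper API; datasketch, the
# word-level shingling and the 0.8 threshold are assumptions for illustration.
from datasketch import MinHash, MinHashLSH


def minhash_signature(doc: str, num_perm: int = 128) -> MinHash:
    """Build a MinHash signature from the document's whitespace-split tokens."""
    signature = MinHash(num_perm=num_perm)
    for token in doc.split():
        signature.update(token.encode("utf-8"))
    return signature


def deduplicate(corpus):
    """Yield (idx, doc) pairs that have no near-duplicate already in the cache."""
    lsh_cache = MinHashLSH(threshold=0.8, num_perm=128)
    for idx, doc in corpus:
        signature = minhash_signature(doc)
        # Query the cache for near-duplicate candidates
        candidates = lsh_cache.query(signature)
        if len(candidates) == 0:
            # No candidates: index the document and keep it
            lsh_cache.insert(idx, signature)
            yield idx, doc


if __name__ == "__main__":
    corpus = list(enumerate(["hej med dig min ven", "hej med dig min ven", "farvel du gamle"]))
    print(list(deduplicate(corpus)))  # the exact duplicate is dropped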
2 changes: 0 additions & 2 deletions src/dfm/cleaning/quality_filter.py
@@ -272,7 +272,6 @@ def __init__(
short_long_sentence_length_split: int = 30,
short_long_sentence_threshold: float = 0.5,
):
-
__available_language_detection_tools = ["langdetect", "luga"]

if language_detection_tool not in __available_language_detection_tools:
@@ -802,7 +801,6 @@ def duplicate_ngram_fraction_filter(

for i, _ in enumerate(doc):
for ngram_size in range(lower, upper + 1):
-
min_, max_ = minmax[ngram_size]
end = i + ngram_size

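The second quality_filter.py hunk sits inside duplicate_ngram_fraction_filter, which scores how much of a document is covered by repeated n-grams. The project's filter tracks character spans per n-gram size; the following is only a simplified sketch of the underlying idea, where the function name, word-level n-grams, and example values are assumptions.

# Simplified sketch of the duplicate n-gram fraction idea behind
# duplicate_ngram_fraction_filter. Not the project's implementation: it works
# on word-level n-grams and counts duplicated characters for one n-gram size.
from collections import Counter


def duplicate_ngram_char_fraction(doc: str, ngram_size: int) -> float:
    """Fraction of characters belonging to n-grams that occur more than once."""
    words = doc.split()
    ngrams = [" ".join(words[i : i + ngram_size]) for i in range(len(words) - ngram_size + 1)]
    if not ngrams:
        return 0.0
    counts = Counter(ngrams)
    total_chars = sum(len(gram) * count for gram, count in counts.items())
    duplicated_chars = sum(len(gram) * count for gram, count in counts.items() if count > 1)
    return duplicated_chars / total_chars


if __name__ == "__main__":
    doc = "hej med dig hej med dig farvel du gamle"
    print(round(duplicate_ngram_char_fraction(doc, ngram_size=3), 2))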
3 changes: 0 additions & 3 deletions src/dfm/cleaning/sentence_filter.py
@@ -72,7 +72,6 @@ def __init__(
curly_brackets_threshold: int = 2,
n_jobs: int = -1,
):
-
# Store arguments as attributes
self.title_cased_words_threshold = title_cased_words_threshold
self.min_num_words = min_num_words
@@ -186,7 +185,6 @@ def filter_sample(
yield filter_sample(doc)
else:
with Parallel(n_jobs=n_jobs, backend="threading") as parallel:
-
# Set up iterator, depending on whether we have a progress bar or not
if progress_bar:
itr = tqdm(docs, desc="Filtering corpus", total=total)
@@ -233,7 +231,6 @@ def apply_filters(self, doc: str) -> Union[str, None]:
"""
# Iterate over all the filter functions
for filter_name, filter_fn in self.filters.items():
-
# Apply the filter function, which returns True if the document satisfied
# the filter, and False if it didn't
satisfied_filter = filter_fn(doc)
2 changes: 0 additions & 2 deletions tests/cleaning/deduper_test.py
@@ -60,7 +60,6 @@ def deduper(self, **kwargs):
return Deduper(**dict(default_test_args, **kwargs))

def dedup(self, corpus, **kwargs):
-
# Add a document ID to the corpus, if it isn't there already
if isinstance(corpus, list) and isinstance(corpus[0], str):
corpus = list(enumerate(corpus))
@@ -212,7 +211,6 @@ def test_load_from_disk(self, minhash_params):
corpus = ["hej med dig min ven", "hej med dig min ven", "farvel du gamle"]
corpus = list(enumerate(corpus))
with tempfile.TemporaryDirectory() as temp:
-
# Create a deduper loaded from disk, and a different new one
deduper = self.deduper(split_method="paragraph")
deduper.deduplicate(corpus, output_dir=temp, overwrite=True)
