# Heuristic filter cascade for generic non-English text whose words are
# separated by spaces.
#
# Layout:
#   input_field - presumably the document field the filters read from;
#                 confirm against the code that loads this file.
#   filters     - ordered list; each entry has a `name` (dotted path of a
#                 filter class) and a `params` mapping (presumably passed to
#                 that class as keyword arguments - confirm against loader).
input_field: text
filters:
  # The filters below define a chain of heuristic filters to be applied to
  # each document in a corpus. The filter listed at the top is applied first,
  # and the following filters are applied in the order they appear in this
  # file. Each filter can be removed and re-ordered as desired.
  - name: nemo_curator.filters.heuristic_filter.SymbolsToWordsFilter
    params:
      max_symbol_to_word_ratio: 0.1
  - name: nemo_curator.filters.heuristic_filter.NumbersFilter
    params:
      max_number_to_text_ratio: 0.15
  - name: nemo_curator.filters.heuristic_filter.UrlsFilter
    params:
      max_url_to_text_ratio: 0.2
  - name: nemo_curator.filters.heuristic_filter.WhiteSpaceFilter
    params:
      max_white_space_ratio: 0.25
  - name: nemo_curator.filters.heuristic_filter.ParenthesesFilter
    params:
      max_parentheses_ratio: 0.1
  - name: nemo_curator.filters.heuristic_filter.BoilerPlateStringFilter
    params:
      remove_if_at_top_or_bottom: true
      max_boilerplate_string_ratio: 0.4
  - name: nemo_curator.filters.heuristic_filter.RepeatedLinesFilter
    params:
      max_repeated_line_fraction: 0.7
  - name: nemo_curator.filters.heuristic_filter.RepeatedParagraphsFilter
    params:
      max_repeated_paragraphs_ratio: 0.7
  - name: nemo_curator.filters.heuristic_filter.RepeatedLinesByCharFilter
    params:
      max_repeated_lines_char_ratio: 0.8
  - name: nemo_curator.filters.heuristic_filter.RepeatedParagraphsByCharFilter
    params:
      max_repeated_paragraphs_char_ratio: 0.8
  - name: nemo_curator.filters.heuristic_filter.WordCountFilter
    params:
      min_words: 50
      max_words: 100000
  # NOTE: This filter tends to remove many documents and will need to
  # be tuned per language.
  - name: nemo_curator.filters.heuristic_filter.PunctuationFilter
    params:
      max_num_sentences_without_endmark_ratio: 0.85
  - name: nemo_curator.filters.heuristic_filter.MeanWordLengthFilter
    params:
      max_mean_word_length: 10
      min_mean_word_length: 3
  - name: nemo_curator.filters.heuristic_filter.LongWordFilter
    params:
      max_word_length: 1000
  - name: nemo_curator.filters.heuristic_filter.EllipsisFilter
    params:
      max_num_lines_ending_with_ellipsis_ratio: 0.3
  # Top N-gram filters for N-grams 2, 3, and 4.
  - name: nemo_curator.filters.heuristic_filter.RepeatingTopNGramsFilter
    params:
      n: 2
      max_repeating_ngram_ratio: 0.2
  - name: nemo_curator.filters.heuristic_filter.RepeatingTopNGramsFilter
    params:
      n: 3
      max_repeating_ngram_ratio: 0.18
  - name: nemo_curator.filters.heuristic_filter.RepeatingTopNGramsFilter
    params:
      n: 4
      max_repeating_ngram_ratio: 0.16
  # Duplicate N-gram filters for N-grams 5, 6, 7, 8, 9, and 10.
  - name: nemo_curator.filters.heuristic_filter.RepeatingDuplicateNGramsFilter
    params:
      n: 5
      max_repeating_duplicate_ngram_ratio: 0.15
  - name: nemo_curator.filters.heuristic_filter.RepeatingDuplicateNGramsFilter
    params:
      n: 6
      max_repeating_duplicate_ngram_ratio: 0.14
  - name: nemo_curator.filters.heuristic_filter.RepeatingDuplicateNGramsFilter
    params:
      n: 7
      max_repeating_duplicate_ngram_ratio: 0.13
  - name: nemo_curator.filters.heuristic_filter.RepeatingDuplicateNGramsFilter
    params:
      n: 8
      max_repeating_duplicate_ngram_ratio: 0.12
  - name: nemo_curator.filters.heuristic_filter.RepeatingDuplicateNGramsFilter
    params:
      n: 9
      max_repeating_duplicate_ngram_ratio: 0.11
  - name: nemo_curator.filters.heuristic_filter.RepeatingDuplicateNGramsFilter
    params:
      n: 10
      max_repeating_duplicate_ngram_ratio: 0.10
  - name: nemo_curator.filters.heuristic_filter.BulletsFilter
    params:
      max_bullet_lines_ratio: 0.9