Source code for nn_search2.pos_tagger

#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Standalone POS-tagger using NLTK's Averaged Perceptron.
"""
import argparse
import codecs
import os
import re
import sys
from string import punctuation
import unicodedata
import nltk
from nltk.tag.perceptron import PerceptronTagger


[docs]def read_file(fpath): """ Read the specified file. """ try: with codecs.open(fpath, 'r', encoding='utf-8') as fopened: fdata = normalize_text(fopened.read()) except (OSError, IOError): print 'Can not process "{0}", skipping'.format(fpath) return None return fdata
[docs]def write_file(out_path, tagged_text): """ Write the results of processing to file. Args: | *out_path* (str) -- output file path | *tagged_text* (str) -- pos-tagged data """ try: with codecs.open(out_path, 'w', encoding='utf-8') as fnew: fnew.write(tagged_text) except (OSError, IOError): print 'Can not write "{0}"!\n'.format(out_path) +\ 'Make sure there is enough free space on disk.' sys.exit(1)
[docs]def normalize_text(text): """ Remove non-utf8 characters. Convert text to ascii. <If you throw some utf-8 text to English POS-tagger, it might fail because even some English texts might contain weird chars, accents and diacritics.> Args: *chars* (str) -- strings of characters Returns: *ascii_text* (str) -- text converted to ascii """ # removing some non-utf8 chars utext = re.sub(r'[^\x00-\x7F]+', '', text) try: utext = unicode(text, 'utf-8') except TypeError as err: # file is already unicode print err # converting to ascii ascii_text = unicodedata.normalize('NFKD', utext).encode('ascii', 'ignore') return ascii_text
[docs]def tag(text): """ Process given text with PerceptronTagger. Args: | *text* (str) -- raw text data Returns: | *full_text* (str) -- tagged text """ tagger = PerceptronTagger() tagset = None sent_detector = nltk.data.load('tokenizers/punkt/english.pickle') sents_tokenized = sent_detector.tokenize(text) tokenized = [] for sent in sents_tokenized: tokenized.append(nltk.tokenize.word_tokenize(sent, language='english')) tagged_text = '' for sent_toks in tokenized: pos_text = nltk.tag._pos_tag(sent_toks, None, tagger) joined_tags = ['_'.join([pos[0], 'PUNC' if pos[1] in punctuation else pos[1]]) for pos in pos_text] tagged_text = '\n'.join([tagged_text, ' '.join(joined_tags)]) return tagged_text.lstrip('\n')
[docs]def main(args, ui_call=False): """ Create directories and save the results. Handle given arguments accordingly. Args: | *ui_call* (bool) -- True if main called from withing UI | *in_file_data* (dict) -- dict of type {fname: 'file data'} | *in_dir_data* (dict) -- dict of type {fname: 'file data'} <Processing various file types in batch mode is supported only via UI. I want ``pos_tagger.py`` to have only TextBlob and nltk as dependencies.> """ in_file = None in_file_data = None in_dir = None in_dir_data = None if not ui_call: in_file = args.file in_dir = args.dir out_dir = args.out else: in_file_data, in_dir_data, out_dir = args if not os.path.exists(out_dir): os.mkdir(out_dir) # console single mode if in_file: print 'Processing "{0}"'.format(in_file) out_path = os.path.join(out_dir, 'tagged_' + os.path.basename(in_file)) data = read_file(in_file) tagged_text = tag(data) write_file(out_path, tagged_text) # ui single mode elif in_file_data: in_fname = in_file_data.keys()[0] print 'Processing "{0}"'.format(in_fname) out_path = os.path.join(out_dir, 'tagged_' + os.path.basename(in_fname)) tagged_text = tag(in_file_data[in_fname]) write_file(out_path, tagged_text) # console batch mode elif in_dir: files = [os.path.join(in_dir, fname) for fname in os.listdir(in_dir)] # only plain text files are supported in console batch mode! for text_file in files: print 'Processing "{0}"'.format(text_file) out_path = os.path.join(out_dir, 'tagged_' + os.path.basename(text_file)) data = read_file(text_file) if not data: continue tagged_text = tag(data) write_file(out_path, tagged_text) # UI batch mode elif in_dir_data: for fname, fdata in in_dir_data.items(): if not fdata: continue print 'Processing "{0}"'.format(fname) out_path = os.path.join(out_dir, 'tagged_' + os.path.basename(fname)) tagged_text = tag(fdata) write_file(out_path, tagged_text) else: print 'Please provide a directory or a filename to process!' sys.exit(1) print '\n', tagged_text, '\n' print 'The text above was successfully saved: %s' % out_path
if __name__ == '__main__': prs = argparse.ArgumentParser(description=""" Standalone POS-tagger using NLTK's Averaged Perceptron. """) prs.add_argument('-d', '--dir', help='Specify a directory with text files to process.', required=False) prs.add_argument('-f', '--file', help='Specify a text file to process.', required=False) prs.add_argument('-o', '--out', help='Specify output directory.', default=os.path.join(os.getcwd(), 'output'), required=False) arguments = prs.parse_args() main(arguments)