Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

# Natural Language Toolkit: Word List Corpus Reader 

# 

# Copyright (C) 2001-2012 NLTK Project 

# Author: Steven Bird <sb@ldc.upenn.edu> 

#         Edward Loper <edloper@gradient.cis.upenn.edu> 

# URL: <http://www.nltk.org/> 

# For license information, see LICENSE.TXT 

 

from nltk import compat 

from nltk.tokenize import line_tokenize 

 

from .util import * 

from .api import * 

 

class WordListCorpusReader(CorpusReader):
    """
    List of words, one per line.  Blank lines are ignored.
    """

    def words(self, fileids=None):
        """
        :param fileids: a single file identifier, a list of file
            identifiers, or None to use every file of this corpus.
        :return: the raw text of the specified fileids, split into
            lines by ``line_tokenize``.
        """
        return line_tokenize(self.raw(fileids))

    def raw(self, fileids=None):
        """
        :param fileids: a single file identifier, a list of file
            identifiers, or None to use every file of this corpus.
        :return: the contents of the specified fileids, combined
            with ``concat``.
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, compat.string_types):
            # A bare string is one fileid, not a sequence of
            # one-character fileids.
            fileids = [fileids]

        contents = []
        for fileid in fileids:
            stream = self.open(fileid)
            # Close each stream as soon as it has been read, rather
            # than leaving the handle open until garbage collection.
            try:
                contents.append(stream.read())
            finally:
                stream.close()
        return concat(contents)

 

 

class SwadeshCorpusReader(WordListCorpusReader):
    """
    Word list reader that can also align the word lists of several
    files entry by entry (see ``entries``).
    """

    def entries(self, fileids=None):
        """
        :param fileids: a single file identifier, a list of file
            identifiers, or None (or any empty value) to use every
            file of this corpus.
        :return: a list of tuples, where the i-th tuple holds the
            i-th word of each specified fileid, in the order the
            fileids were given.  The result is as long as the
            shortest word list.
        """
        if not fileids:
            fileids = self.fileids()
        elif isinstance(fileids, compat.string_types):
            # Treat a bare string as one fileid, as ``raw`` does;
            # otherwise it would be iterated character by character.
            fileids = [fileids]

        wordlists = [self.words(f) for f in fileids]
        # On Python 3 ``zip`` is a one-shot iterator; materialize it
        # so the result can be indexed and traversed more than once,
        # as it can on Python 2.
        return list(zip(*wordlists))