import nltk
from nltk.corpus import conll2000


class BigramChunker(nltk.ChunkParserI):
    """Chunk parser that assigns chunk tags from part-of-speech tags.

    The chunker is trained on (POS tag, chunk tag) pairs extracted from
    chunked sentences, and uses a bigram tagger so that each chunk tag is
    predicted from the current POS tag together with the preceding chunk
    tag context.
    """

    def __init__(self, train_sents):
        """Train the underlying tagger.

        :param train_sents: iterable of chunk trees accepted by
            ``nltk.chunk.tree2conlltags``.
        """
        # Drop the word from every (word, pos, chunk) triple: the tagger
        # learns to map POS tags to chunk tags.
        train_data = [
            [(pos, chunktag)
             for _word, pos, chunktag in nltk.chunk.tree2conlltags(sent)]
            for sent in train_sents
        ]
        # A bigram tagger, as the class name promises (this previously
        # built a UnigramTagger, making the class a unigram chunker).
        # NOTE(review): no backoff tagger is configured; POS contexts not
        # seen in training presumably receive a None chunk tag -- confirm
        # whether a UnigramTagger backoff is wanted here.
        self.tagger = nltk.BigramTagger(train_data)

    def parse(self, sentence):
        """Chunk a POS-tagged sentence.

        :param sentence: sequence of ``(word, pos)`` pairs.
        :return: the chunk structure built by
            ``nltk.chunk.conlltags2tree`` from the predicted chunk tags.
        """
        pos_tags = [pos for (_word, pos) in sentence]
        # The tagger treats POS tags as its "words" and chunk tags as
        # its "tags".
        tagged_pos_tags = self.tagger.tag(pos_tags)
        chunktags = [chunktag for (_pos, chunktag) in tagged_pos_tags]
        # Re-attach the words to obtain (word, pos, chunktag) triples.
        conlltags = [
            (word, pos, chunktag)
            for ((word, pos), chunktag) in zip(sentence, chunktags)
        ]
        return nltk.chunk.conlltags2tree(conlltags)


# Main
# Only VP chunks are requested from the CoNLL-2000 corpus.
test_sents = conll2000.chunked_sents('test.txt', chunk_types=['VP'])
train_sents = conll2000.chunked_sents('train.txt', chunk_types=['VP'])

##unigram_chunker = UnigramChunker(train_sents)
##print unigram_chunker.evaluate(test_sents)

bigram_chunker = BigramChunker(train_sents)
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(bigram_chunker.evaluate(test_sents))