-
Notifications
You must be signed in to change notification settings - Fork 5
/
parser.py
executable file
·174 lines (136 loc) · 5.15 KB
/
parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
import nltk
import sys
TERMINALS = """
Adj -> "country" | "dreadful" | "enigmatical" | "little" | "moist" | "red"
Adv -> "down" | "here" | "never"
Conj -> "and"
Det -> "a" | "an" | "his" | "my" | "the"
N -> "armchair" | "companion" | "day" | "door" | "hand" | "he" | "himself"
N -> "holmes" | "home" | "i" | "mess" | "paint" | "palm" | "pipe" | "she"
N -> "smile" | "thursday" | "walk" | "we" | "word"
P -> "at" | "before" | "in" | "of" | "on" | "to" | "until"
V -> "arrived" | "came" | "chuckled" | "had" | "lit" | "said" | "sat"
V -> "smiled" | "tell" | "were"
"""
# NONTERMINALS = """
# S -> NP VP
# S -> V
# NP -> N | Det N | P NP | Det Adj | Adj NP | Adv NP | Conj NP | Adv
# NP -> Conj NP | Conj S | P S | NP NP
# VP -> V | V NP | Adv VP
# """
# The nonterminal rules above generate more than 50 different parse trees for 9.txt
NONTERMINALS = """
S -> NP VP | VP | S Conj S | S P S
NP -> N | Det N | Det NP | P NP | Adj NP | Adv NP | Conj NP | N NP | N Adv
VP -> V | V NP | Adv VP | V Adv
"""
# S -> NP VP => A sentence composed of a noun-phrase and a verb-phrase
# S -> VP => A sentence with only a verb phrase e.g. "Came home"
# S -> S Conj S => A complex sentence with simpler ones connected with Conj
# S -> S P S => Prepositions like 'until' may also join simpler sentences
# NP -> N => A noun for a subject or object
# NP -> Det N or Det NP => A noun or noun phrase with a determiner (the, a, his)
# NP -> (?) NP => A noun phrase may follow other parts of speech
# NP -> N Adv => A noun may be followed by an adverb e.g. "...mess here"
# VP -> V => Could just be a verb
# VP -> V NP => Could be a verb followed by an object (S + V + O)
# VP -> Adv VP or V Adv => A verb may be described by an adverb before or after
grammar = nltk.CFG.fromstring(NONTERMINALS + TERMINALS)
parser = nltk.ChartParser(grammar)
def main():
# If filename specified, read sentence from file
if len(sys.argv) == 2:
with open(sys.argv[1]) as f:
s = f.read()
# Otherwise, get sentence as input
else:
s = input("Sentence: ")
# Convert input into list of words
s = preprocess(s)
# Attempt to parse sentence
try:
trees = list(parser.parse(s))
except ValueError as e:
print(e)
return
if not trees:
print("Could not parse sentence.")
return
# Print each tree with noun phrase chunks
for tree in trees:
tree.pretty_print()
print("Noun Phrase Chunks")
for np in np_chunk(tree):
print(" ".join(np.flatten()))
def preprocess(sentence):
"""
Convert `sentence` to a list of its words.
Pre-process sentence by converting all characters to lowercase
and removing any word that does not contain at least one alphabetic
character.
"""
# get tokens: each word, punctuation, any special character
# as a list
tokens = nltk.word_tokenize(sentence)
words = []
for word in tokens:
valid = False
for letter in word:
# if even one of the letters is an alphabet, the word is valid
if letter.isalpha():
valid = True
# add valid word to the list of words, in lowercase
if valid:
words.append(word.lower())
return words
def check(subtree):
"""
Return True if any child of 'subtree' has the label "NP"
"""
# if the subtree itself is an NP, then return True
if subtree.label() == "NP":
return True
# if the subtree has only one child, then its probably a terminal node
# in which case return False
# but the label must not be 'S' since
# S can contain only one child when S -> VP is followed by the parse tree
if len(subtree) == 1 and subtree.label() != 'S':
return False
# otherwise go further into the subtree
# and evaluate each subsubtree there
for subsubtree in subtree:
if check(subsubtree):
return True
return False
def np_chunk(tree):
"""
Return a list of all noun phrase chunks in the sentence tree.
A noun phrase chunk is defined as any subtree of the sentence
whose label is "NP" that does not itself contain any other
noun phrases as subtrees.
"""
chunks = []
for subtree in tree:
# get the label of the subtree defined by the grammar
node = subtree.label()
# check if this tree contains a subtree with 'NP'
# if not check another subtree
contains = check(subtree)
if not contains:
continue
# if the node is a NP or VP or S, then
# go further into the tree to check for noun phrase chunks
# at each point take the list of trees returned and
# append each to the actual chunks' list in the parent
if node == "NP" or node == "VP" or node == "S":
subsubtree = np_chunk(subtree)
for np in subsubtree:
chunks.append(np)
# if the current tree has no subtree with a 'NP' label
# and is itself an 'NP' labeled node then, append the tree to chunks
if tree.label() == "NP" and not contains:
chunks.append(tree)
return chunks
if __name__ == "__main__":
main()