Skip to content

Commit

Permalink
Merge pull request #24 from ProjetPP/lemmatization
Browse files Browse the repository at this point in the history
Lemmatization
  • Loading branch information
Ezibenroc committed Nov 8, 2014
2 parents 55d129c + 5c63b7e commit 11b0e0f
Show file tree
Hide file tree
Showing 7 changed files with 38 additions and 12 deletions.
3 changes: 2 additions & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ python:
- pypy3

install:
- pip install scrutinizer-ocular coverage webtest httmock requests ppp_datamodel ppp_core jsonrpclib-pelix
- pip install scrutinizer-ocular coverage webtest httmock requests ppp_datamodel ppp_core jsonrpclib-pelix nltk
- python -m nltk.downloader wordnet

before_script:
- ./setup.py install
Expand Down
2 changes: 1 addition & 1 deletion demo/demo4.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def get_tree():
nlp = StanfordNLP()
result = nlp.parse(input(""))
tree = ppp_nlp_classical.computeTree(result['sentences'][0])
#ppp_nlp_classical.simplify(tree)
ppp_nlp_classical.simplify(tree)
return tree

print(get_tree())
18 changes: 18 additions & 0 deletions ppp_nlp_classical/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import sys
from .preprocessingMerge import mergeQuotations, mergeNamedEntityTag
from nltk.stem.wordnet import WordNetLemmatizer

class DependenciesTree:
"""
Expand Down Expand Up @@ -111,6 +112,21 @@ def initText(t,s):
for c in t.child:
initText(c,s)

def normalizeWord(word,lmtzr):
    """
        Return a lemma for the given word.

        The word is first lemmatized as a noun ('n'). That result is kept
        only when it is strictly shorter than the input; otherwise the
        word is lemmatized as a verb ('v') and that result is returned.
    """
    as_noun = lmtzr.lemmatize(word, 'n')
    if len(as_noun) >= len(word):
        # The noun pass did not shorten the word: fall back to the verb lemma.
        return lmtzr.lemmatize(word, 'v')
    return as_noun

def normalize(t,lmtzr):
    """
        Lemmatize, in place, the words of the tree rooted at t.

        Children are processed before the node itself. A node's wordList
        is rewritten only when its namedEntityTag equals 'undef'; each
        entry keeps its second component and has its first component
        replaced by normalizeWord's result.
    """
    for subtree in t.child:
        normalize(subtree, lmtzr)
    if t.namedEntityTag != 'undef':
        return
    lemmatized = []
    for entry in t.wordList:
        lemmatized.append((normalizeWord(entry[0], lmtzr), entry[1]))
    t.wordList = lemmatized

def computeTree(r):
"""
Compute the dependence tree.
Expand All @@ -125,4 +141,6 @@ def computeTree(r):
initText(tree,r['text'].replace('"','\\\"'))
mergeQuotations(tree,r) # quotation merging
mergeNamedEntityTag(tree) # NER merging
lmtzr = WordNetLemmatizer()
normalize(tree,lmtzr)
return tree
7 changes: 7 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,15 @@
install_requires=[
'ppp_datamodel>=0.5,<0.6',
'ppp_core>=0.5,<0.6',
'jsonrpclib-pelix',
'nltk'
],
packages=[
'ppp_nlp_classical',
],
)

import sys

# When the script is invoked with the 'install' argument, also fetch the
# WordNet corpus through nltk. nltk is imported lazily so that other
# invocations of this script do not need it to be importable.
if 'install' in sys.argv:
    import nltk
    nltk.download("wordnet")
8 changes: 4 additions & 4 deletions tests/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,10 +69,10 @@ def give_john_smith():
def give_john_smith_string():
s="digraph relations {\n"
s+="\t\"ROOT0\"[label=\"ROOT\",shape=box];\n"
s+="\t\"ROOT0\" -> \"lives3\"[label=\"root\"];\n"
s+="\t\"lives3\"[label=\"lives\",shape=box];\n"
s+="\t\"lives3\" -> \"John1\"[label=\"nsubj\"];\n"
s+="\t\"lives3\" -> \"United6\"[label=\"prep_in\"];\n"
s+="\t\"ROOT0\" -> \"life3\"[label=\"root\"];\n"
s+="\t\"life3\"[label=\"life\",shape=box];\n"
s+="\t\"life3\" -> \"John1\"[label=\"nsubj\"];\n"
s+="\t\"life3\" -> \"United6\"[label=\"prep_in\"];\n"
s+="\t\"John1\"[label=\"John Smith [PERSON]\",shape=box];\n"
s+="\t\"United6\"[label=\"United Kingdom [LOCATION]\",shape=box];\n"
s+="\t\"United6\" -> \"the5\"[label=\"det\"];\n"
Expand Down
6 changes: 3 additions & 3 deletions tests/test_dependencytree.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def testQuotationMerge(self):
self.assertEqual(len(root.child),1)
# Wrote
wrote=root.child[0]
self.assertEqual(wrote.wordList,[("wrote",2)])
self.assertEqual(wrote.wordList,[("write",2)])
self.assertEqual(wrote.namedEntityTag,'undef')
self.assertEqual(wrote.dependency,'root')
self.assertEqual(wrote.parent,root)
Expand Down Expand Up @@ -88,7 +88,7 @@ def testEntityTagMerge1(self):
self.assertEqual(len(root.child),1)
# Lives
lives=root.child[0]
self.assertEqual(lives.wordList,[("lives",3)])
self.assertEqual(lives.wordList,[("life",3)])
self.assertEqual(lives.namedEntityTag,'undef')
self.assertEqual(lives.dependency,'root')
self.assertEqual(lives.parent,tree)
Expand Down Expand Up @@ -126,7 +126,7 @@ def testEntityTagMerge2(self):
self.assertEqual(len(root.child),1)
# Is
is_=root.child[0]
self.assertEqual(is_.wordList,[("is",2)])
self.assertEqual(is_.wordList,[("be",2)])
self.assertEqual(is_.namedEntityTag,'undef')
self.assertEqual(is_.dependency,'root')
self.assertEqual(is_.parent,tree)
Expand Down
6 changes: 3 additions & 3 deletions tests/test_hierarchy.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def testHierarchySimplification(self):
self.assertEqual(len(root.child),1)
# Is
is_=root.child[0]
self.assertEqual(is_.wordList,[("is",2)])
self.assertEqual(is_.wordList,[("be",2)])
self.assertEqual(is_.namedEntityTag,'undef')
self.assertEqual(is_.dependency,'t0')
self.assertEqual(is_.parent,root)
Expand Down Expand Up @@ -59,7 +59,7 @@ def testIgnore(self):
self.assertEqual(len(root.child),1)
# Are
are=root.child[0]
self.assertEqual(are.wordList,[("are",3)])
self.assertEqual(are.wordList,[("be",3)])
self.assertEqual(are.namedEntityTag,'undef')
self.assertEqual(are.dependency,'t0')
self.assertEqual(are.parent,root)
Expand All @@ -77,7 +77,7 @@ def testHierarchySimplification2(self):
self.assertEqual(len(root.child),1)
# Is
is_=root.child[0]
self.assertEqual(is_.wordList,[("is",2)])
self.assertEqual(is_.wordList,[("be",2)])
self.assertEqual(is_.namedEntityTag,'undef')
self.assertEqual(is_.dependency,'t0')
self.assertEqual(is_.parent,root)
Expand Down

0 comments on commit 11b0e0f

Please sign in to comment.