From 589b63f008b40cdef186b71756ed0cb9c222b5fb Mon Sep 17 00:00:00 2001
From: Tom Cornebize
Date: Thu, 6 Nov 2014 21:24:00 +0100
Subject: [PATCH 1/4] Add lemmatization.

---
 .travis.yml                        |  2 +-
 demo/demo4.py                      |  2 +-
 ppp_nlp_classical/preprocessing.py | 18 ++++++++++++++++++
 tests/data.py                      |  8 ++++----
 tests/test_dependencytree.py       |  6 +++---
 tests/test_hierarchy.py            |  6 +++---
 6 files changed, 30 insertions(+), 12 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 839f488..72847cf 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -8,7 +8,7 @@ python:
   - pypy3
 
 install:
-  - pip install scrutinizer-ocular coverage webtest httmock requests ppp_datamodel ppp_core jsonrpclib-pelix
+  - pip install scrutinizer-ocular coverage webtest httmock requests ppp_datamodel ppp_core jsonrpclib-pelix nltk
 
 before_script:
   - ./setup.py install
diff --git a/demo/demo4.py b/demo/demo4.py
index 96b6cb7..0206c71 100644
--- a/demo/demo4.py
+++ b/demo/demo4.py
@@ -21,7 +21,7 @@ def get_tree():
     nlp = StanfordNLP()
     result = nlp.parse(input(""))
     tree = ppp_nlp_classical.computeTree(result['sentences'][0])
-    #ppp_nlp_classical.simplify(tree)
+    ppp_nlp_classical.simplify(tree)
     return tree
 
 print(get_tree())
diff --git a/ppp_nlp_classical/preprocessing.py b/ppp_nlp_classical/preprocessing.py
index 1990ff8..fb3e512 100644
--- a/ppp_nlp_classical/preprocessing.py
+++ b/ppp_nlp_classical/preprocessing.py
@@ -2,6 +2,7 @@
 import sys
 
 from .preprocessingMerge import mergeQuotations, mergeNamedEntityTag
+from nltk.stem.wordnet import WordNetLemmatizer
 
 class DependenciesTree:
     """
@@ -111,6 +112,21 @@ def initText(t,s):
     for c in t.child:
         initText(c,s)
 
+def normalizeWord(word,lmtzr):
+    """
+    Apply lemmatization to the given word.
+    """
+    result=lmtzr.lemmatize(word,'n')
+    if len(result)
diff --git a/tests/data.py b/tests/data.py
--- a/tests/data.py
+++ b/tests/data.py
-    s+="\t\"ROOT0\" -> \"lives3\"[label=\"root\"];\n"
-    s+="\t\"lives3\"[label=\"lives\",shape=box];\n"
-    s+="\t\"lives3\" -> \"John1\"[label=\"nsubj\"];\n"
-    s+="\t\"lives3\" -> \"United6\"[label=\"prep_in\"];\n"
+    s+="\t\"ROOT0\" -> \"life3\"[label=\"root\"];\n"
+    s+="\t\"life3\"[label=\"life\",shape=box];\n"
+    s+="\t\"life3\" -> \"John1\"[label=\"nsubj\"];\n"
+    s+="\t\"life3\" -> \"United6\"[label=\"prep_in\"];\n"
     s+="\t\"John1\"[label=\"John Smith [PERSON]\",shape=box];\n"
     s+="\t\"United6\"[label=\"United Kingdom [LOCATION]\",shape=box];\n"
     s+="\t\"United6\" -> \"the5\"[label=\"det\"];\n"
diff --git a/tests/test_dependencytree.py b/tests/test_dependencytree.py
index 8afaf36..6b8f322 100644
--- a/tests/test_dependencytree.py
+++ b/tests/test_dependencytree.py
@@ -50,7 +50,7 @@ def testQuotationMerge(self):
         self.assertEqual(len(root.child),1)
         # Wrote
         wrote=root.child[0]
-        self.assertEqual(wrote.wordList,[("wrote",2)])
+        self.assertEqual(wrote.wordList,[("write",2)])
         self.assertEqual(wrote.namedEntityTag,'undef')
         self.assertEqual(wrote.dependency,'root')
         self.assertEqual(wrote.parent,root)
@@ -88,7 +88,7 @@ def testEntityTagMerge1(self):
         self.assertEqual(len(root.child),1)
         # Lives
         lives=root.child[0]
-        self.assertEqual(lives.wordList,[("lives",3)])
+        self.assertEqual(lives.wordList,[("life",3)])
         self.assertEqual(lives.namedEntityTag,'undef')
         self.assertEqual(lives.dependency,'root')
         self.assertEqual(lives.parent,tree)
@@ -126,7 +126,7 @@ def testEntityTagMerge2(self):
         self.assertEqual(len(root.child),1)
         # Is
         is_=root.child[0]
-        self.assertEqual(is_.wordList,[("is",2)])
+        self.assertEqual(is_.wordList,[("be",2)])
        self.assertEqual(is_.namedEntityTag,'undef')
         self.assertEqual(is_.dependency,'root')
         self.assertEqual(is_.parent,tree)
diff --git a/tests/test_hierarchy.py b/tests/test_hierarchy.py
index 374e9dc..40272a0 100644
--- a/tests/test_hierarchy.py
+++ b/tests/test_hierarchy.py
@@ -27,7 +27,7 @@ def testHierarchySimplification(self):
         self.assertEqual(len(root.child),1)
         # Is
         is_=root.child[0]
-        self.assertEqual(is_.wordList,[("is",2)])
+        self.assertEqual(is_.wordList,[("be",2)])
         self.assertEqual(is_.namedEntityTag,'undef')
         self.assertEqual(is_.dependency,'t0')
         self.assertEqual(is_.parent,root)
@@ -59,7 +59,7 @@ def testIgnore(self):
         self.assertEqual(len(root.child),1)
         # Are
         are=root.child[0]
-        self.assertEqual(are.wordList,[("are",3)])
+        self.assertEqual(are.wordList,[("be",3)])
         self.assertEqual(are.namedEntityTag,'undef')
         self.assertEqual(are.dependency,'t0')
         self.assertEqual(are.parent,root)
@@ -77,7 +77,7 @@ def testHierarchySimplification2(self):
         self.assertEqual(len(root.child),1)
         # Is
         is_=root.child[0]
-        self.assertEqual(is_.wordList,[("is",2)])
+        self.assertEqual(is_.wordList,[("be",2)])
         self.assertEqual(is_.namedEntityTag,'undef')
         self.assertEqual(is_.dependency,'t0')
         self.assertEqual(is_.parent,root)

From 21cf7d437c0054f114cf3bceb7099025e60ef3c6 Mon Sep 17 00:00:00 2001
From: Tom Cornebize
Date: Thu, 6 Nov 2014 21:37:26 +0100
Subject: [PATCH 2/4] Add wordnet installation in travis.

---
 .travis.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.travis.yml b/.travis.yml
index 72847cf..4a10b90 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -9,6 +9,7 @@ python:
 
 install:
   - pip install scrutinizer-ocular coverage webtest httmock requests ppp_datamodel ppp_core jsonrpclib-pelix nltk
+  - python -m nltk.downloader wordnet
 
 before_script:
   - ./setup.py install

From c619755ead1823cb5c444a478314fd8849d8b36a Mon Sep 17 00:00:00 2001
From: Tom Cornebize
Date: Thu, 6 Nov 2014 22:50:35 +0100
Subject: [PATCH 3/4] Add dependencies.

---
 setup.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/setup.py b/setup.py
index 64d1183..c0fbe72 100755
--- a/setup.py
+++ b/setup.py
@@ -27,6 +27,8 @@
     install_requires=[
         'ppp_datamodel>=0.5,<0.6',
         'ppp_core>=0.5,<0.6',
+        'jsonrpclib-pelix',
+        'nltk'
     ],
     packages=[
         'ppp_nlp_classical',

From 5c63b7e7ed715fa6ba1d817ab9ef3ed1f0283957 Mon Sep 17 00:00:00 2001
From: Tom Cornebize
Date: Fri, 7 Nov 2014 13:54:37 +0100
Subject: [PATCH 4/4] Add wordnet installation in setup.py.

---
 setup.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/setup.py b/setup.py
index c0fbe72..7bbe2f4 100755
--- a/setup.py
+++ b/setup.py
@@ -34,3 +34,8 @@
         'ppp_nlp_classical',
     ],
 )
+
+import sys
+if 'install' in sys.argv:
+    import nltk
+    nltk.download("wordnet")
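
For reference, a minimal standalone sketch of the NLTK calls this series relies on. It is not the patched normalizeWord from PATCH 1/4, whose body is truncated above after "if len(result)"; it only demonstrates WordNetLemmatizer producing the lemmas the updated tests expect, with the part-of-speech argument supplied by hand rather than chosen automatically, and it assumes the wordnet corpus has been fetched as in PATCH 2/4 or 4/4.

# Sketch only: not the normalizeWord() added by PATCH 1/4 (its body is cut off above).
# Assumes nltk is installed and the wordnet corpus is available:
#   pip install nltk
#   python -m nltk.downloader wordnet        # as set up in PATCH 2/4 and 4/4
from nltk.stem.wordnet import WordNetLemmatizer

lmtzr = WordNetLemmatizer()

# Lemmas the updated tests expect, with the POS argument given explicitly here:
print(lmtzr.lemmatize("lives", "n"))   # life  (tests/data.py, testEntityTagMerge1)
print(lmtzr.lemmatize("wrote", "v"))   # write (testQuotationMerge)
print(lmtzr.lemmatize("is", "v"))      # be    (testEntityTagMerge2, test_hierarchy.py)
print(lmtzr.lemmatize("are", "v"))     # be    (testIgnore)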