From 4132a50ad0766143b41ef0e2457c8807fe7e4acd Mon Sep 17 00:00:00 2001 From: swaroop Date: Sun, 12 Jun 2022 23:14:38 +0530 Subject: [PATCH 1/6] keyword_vectors empty case added --- lbl2vec/lbl2vec.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/lbl2vec/lbl2vec.py b/lbl2vec/lbl2vec.py index 1992619..401a65e 100755 --- a/lbl2vec/lbl2vec.py +++ b/lbl2vec/lbl2vec.py @@ -623,8 +623,11 @@ def _get_similar_documents( # the list keywordword_vectors = [doc2vec_model.wv[word] for word in cleaned_keywords_list] - similar_docs = doc2vec_model.dv.most_similar( - positive=keywordword_vectors, topn=num_docs) + if keywordword_vectors != []: + similar_docs = doc2vec_model.dv.most_similar( + positive=keywordword_vectors, topn=num_docs) + else: + logger.warning("No keywords found in this document") except KeyError as error: error.args = ( error.args[0] + " in trained Doc2Vec model. Either replace the keyword from the 'keywords_list' parameter or train a new Doc2Vec model that knows the keyword.",) + error.args[1:] From add987587ab3a8ee075c5807399429914efb8810 Mon Sep 17 00:00:00 2001 From: Ai-Marshal <61105967+Ai-Marshal@users.noreply.github.com> Date: Mon, 13 Jun 2022 01:55:25 +0530 Subject: [PATCH 2/6] Update lbl2vec.py --- lbl2vec/lbl2vec.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lbl2vec/lbl2vec.py b/lbl2vec/lbl2vec.py index 401a65e..70471ed 100755 --- a/lbl2vec/lbl2vec.py +++ b/lbl2vec/lbl2vec.py @@ -627,7 +627,11 @@ def _get_similar_documents( similar_docs = doc2vec_model.dv.most_similar( positive=keywordword_vectors, topn=num_docs) else: + return_docs_keys=[0]*num_docs + return_docs_similarity_scores=[0]*num_docs logger.warning("No keywords found in this document") + return pd.Series([return_docs_keys, return_docs_similarity_scores], index=[ + 'doc_keys', 'doc_similarity_scores']) except KeyError as error: error.args = ( error.args[0] + " in trained Doc2Vec model. Either replace the keyword from the 'keywords_list' parameter or train a new Doc2Vec model that knows the keyword.",) + error.args[1:] From a872381e8d6c78670a0d014afe73d0c1d9c1b592 Mon Sep 17 00:00:00 2001 From: swaroop Date: Mon, 13 Jun 2022 02:11:21 +0530 Subject: [PATCH 3/6] empty keyword_vectors case --- lbl2vec/lbl2vec.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/lbl2vec/lbl2vec.py b/lbl2vec/lbl2vec.py index 1992619..ddd7e29 100755 --- a/lbl2vec/lbl2vec.py +++ b/lbl2vec/lbl2vec.py @@ -623,8 +623,15 @@ def _get_similar_documents( # the list keywordword_vectors = [doc2vec_model.wv[word] for word in cleaned_keywords_list] - similar_docs = doc2vec_model.dv.most_similar( - positive=keywordword_vectors, topn=num_docs) + if keywordword_vectors != []: + similar_docs = doc2vec_model.dv.most_similar( + positive=keywordword_vectors, topn=num_docs) + else: + return_docs_keys = [0] * num_docs + return_docs_similarity_scores = [0] * num_docs + logger.warning("No keywords found in this document") + return pd.Series([return_docs_keys, return_docs_similarity_scores], index=[ + 'doc_keys', 'doc_similarity_scores']) except KeyError as error: error.args = ( error.args[0] + " in trained Doc2Vec model. Either replace the keyword from the 'keywords_list' parameter or train a new Doc2Vec model that knows the keyword.",) + error.args[1:] From 8be9479d51ced794e437529fc6501afa51421606 Mon Sep 17 00:00:00 2001 From: swaroop Date: Fri, 22 Jul 2022 00:45:09 +0530 Subject: [PATCH 4/6] IndexError: list index out of range --- .DS_Store | Bin 0 -> 6148 bytes lbl2vec/lbl2vec.py | 3 +++ 2 files changed, 3 insertions(+) create mode 100644 .DS_Store diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..dbc4d86c74413f2043145731a861d2f3d0f923db GIT binary patch literal 6148 zcmeHKy-EW?5T4Z#g9@gwusrM}AVh?8awf5`5iwwG{)j@3i}-`(HNJv{wS}b$K7sa% zPY_$5!f$rhWH*=CDIzm4`|Zuo&g5I}HcLdReAX%w<%uWGw%mO7t-rtVyy^Q} z-}-xp&bQ&?6{)D(qL`X=;?<>vW!LA{y)IN%Dg|5D=0@?b?l`|!mv*^sj@ooVb&9A* zUCh1us(IeY^3+i?@c` z%cK~$4=QPXX241Vx<=>dr0MU@)h#+1iEw+_eyrE`%pRqphC4Mm%Ik;jm=}?sVE|_~ zOK#4fjKY90APfu`;O|2KWehDQ2KA={gT4X)GcaqxJkJwgU@^3q7=#C+Oe)Z%DtpCH zCLQ*`#)TFWgC?Dnu8ehTWo2(DN>_(H(BY&)gE9&O!hp}fw0X?)`M)*!{_iKrlQ19* z{3`}jrdq3(@ssS?n)-2k)&|f6C=16W26YMsdmQTtAH_>hE${_A0frV6gRns4kAS8@ K24Ub=8TbUa38T#b literal 0 HcmV?d00001 diff --git a/lbl2vec/lbl2vec.py b/lbl2vec/lbl2vec.py index 231d7da..a7ec9a9 100755 --- a/lbl2vec/lbl2vec.py +++ b/lbl2vec/lbl2vec.py @@ -431,10 +431,12 @@ def predict_new_docs( # ToDo: set swifter.progress_bar(self.verbose) again instead of swifter.progress_bar(False) when swifter resolved the TQDM progress bar issues # get document vectors of new documents if multiprocessing: + labeled_docs['doc_vec'] = labeled_docs['doc_word_tokens'].swifter.progress_bar( False).apply(lambda row: self.doc2vec_model.infer_vector(doc_words=row)) else: + labeled_docs['doc_vec'] = labeled_docs['doc_word_tokens'].apply( lambda row: self.doc2vec_model.infer_vector(doc_words=row)) @@ -638,6 +640,7 @@ def _get_similar_documents( error.args[0] + " in trained Doc2Vec model. Either replace the keyword from the 'keywords_list' parameter or train a new Doc2Vec model that knows the keyword.",) + error.args[1:] raise + doc_keys = [docs[0] for docs in similar_docs] doc_scores = [docs[1] for docs in similar_docs] From c25340aa7ca76c5767ba04ad3e6ecb2835c663b6 Mon Sep 17 00:00:00 2001 From: swaroop Date: Fri, 22 Jul 2022 00:59:44 +0530 Subject: [PATCH 5/6] test --- lbl2vec/lbl2vec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lbl2vec/lbl2vec.py b/lbl2vec/lbl2vec.py index a7ec9a9..47af8e6 100755 --- a/lbl2vec/lbl2vec.py +++ b/lbl2vec/lbl2vec.py @@ -643,7 +643,7 @@ def _get_similar_documents( doc_keys = [docs[0] for docs in similar_docs] doc_scores = [docs[1] for docs in similar_docs] - + print(doc_scores, min_num_docs) # add number of min_num_docs documents if too few documents are # chosen by simiarilty threshold alone if min_num_docs is not None and doc_scores[min_num_docs] < similarity_threshold and len( From 95ab7e65b6574c133d7a9c676b0c58ae5600640b Mon Sep 17 00:00:00 2001 From: swaroop Date: Fri, 22 Jul 2022 08:47:33 +0530 Subject: [PATCH 6/6] removed print statement --- lbl2vec/lbl2vec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lbl2vec/lbl2vec.py b/lbl2vec/lbl2vec.py index 47af8e6..7ceaabd 100755 --- a/lbl2vec/lbl2vec.py +++ b/lbl2vec/lbl2vec.py @@ -643,7 +643,7 @@ def _get_similar_documents( doc_keys = [docs[0] for docs in similar_docs] doc_scores = [docs[1] for docs in similar_docs] - print(doc_scores, min_num_docs) + # print(doc_scores, min_num_docs) # add number of min_num_docs documents if too few documents are # chosen by simiarilty threshold alone if min_num_docs is not None and doc_scores[min_num_docs] < similarity_threshold and len(