Projects STRLCPY Maryam Commits 241d5baf
🤬
  • ■ ■ ■ ■ ■ ■
    maryam/core/util/iris/topic.py
    1  -# core/util/iris/topicmodeling.py
    2  -# Based on Hatma Suryotrisongko's prototype = https://github.com/keamanansiber/Maryam/blob/master/notebooks/Prototype_4_TopicModeling_0_1_0_CsvFile_Options_StopwordsRemoval_27062022.ipynb
     1 +# core/util/iris/topic.py
     2 +# Hatma Suryotrisongko
    3 3   
    4 4  import pandas as pd
    5 5  import numpy as np
    skipped 11 lines
    17 17   
    18 18  from gensim.parsing.preprocessing import remove_stopwords, STOPWORDS
    19 19   
     20 +from top2vec import Top2Vec
    20 21   
    21 22  class main:
    22 23   
    23  - def __init__(self, inputfile, filetype, showcharts, verbose):
     24 + def __init__(self, inputfile, filetype, keyword, showcharts, verbose):
    24 25   
    25 26   if verbose == True:
    26 27   print("\n\n DATASET = reading file : " + inputfile)
     28 + print("\n\n Search keyword = " + keyword)
    27 29   
    28 30   if filetype == "csv":
    29 31   # tmp = pd.read_csv(inputfile, header=None, low_memory=False)
    skipped 145 lines
    175 177   return output
    176 178   
    177 179   
     180 + def run_search_topics_top2vec(self, keyword, showcharts, verbose):
     181 + 
     182 + print("\n\n Search Topics Using Top2Vec (caution: might not work well for a small dataset)")
     183 + print("\n the Search Keyword = " + keyword)
     184 + 
     185 + pretrained_embedding_model = "universal-sentence-encoder-multilingual"
     186 + if verbose == True:
     187 + print("\n\n Pretrained Embedding Model")
     188 + # https://tfhub.dev/google/universal-sentence-encoder-multilingual/
     189 + # 16 languages (Arabic, Chinese-simplified, Chinese-traditional, English, French, German, Italian, Japanese, Korean, Dutch, Polish, Portuguese, Spanish, Thai, Turkish, Russian) text encoder.
     190 + print(pretrained_embedding_model)
     191 + 
     192 + model = Top2Vec(documents=self.corpus.tolist(), speed="learn", workers=8)
     193 + if verbose == True:
     194 + print("\n Model = ")
     195 + print(model)
     196 + 
     197 + if model.get_num_topics() < 5:
     198 + ntopics = model.get_num_topics()
     199 + else:
     200 + ntopics = 5
     201 + 
     202 + topic_words, word_scores, topic_nums = model.get_topics(ntopics)
     203 + print(topic_words)
     204 + print(word_scores)
     205 + print(topic_nums)
     206 + 
     207 + print("\n Semantic Search Documents by Keywords = ")
     208 + documents, document_scores, document_ids = model.search_documents_by_keywords(keywords=[keyword], num_docs=5)
     209 + for doc, score, doc_id in zip(documents, document_scores, document_ids):
     210 + print(f"Document: {doc_id}, Score: {score}")
     211 + print("-----------")
     212 + print(doc)
     213 + print("-----------")
     214 + print()
     215 + 
     216 + if showcharts == True:
     217 + print("\n\n Generate Word Clouds = ")
     218 + topic_words, word_scores, topic_scores, topic_nums = model.search_topics(keywords=[keyword], num_topics=ntopics)
     219 + for topic in topic_nums:
     220 + model.generate_topic_wordcloud(topic)
     221 + 
     222 + print("\n Similar Keywords = ")
     223 + words, word_scores = model.similar_words(keywords=[keyword], keywords_neg=[], num_words=20)
     224 + for word, score in zip(words, word_scores):
     225 + print(f"{word} {score}")
     226 + 
  • ■ ■ ■ ■ ■
    maryam/modules/iris/topicmodeling.py
    1  -# TESTED USING =
     1 +# modules/iris/topicmodeling.py
     2 +# # TESTING The Main Function =
    2 3  # topicmodeling -i mixed.json -t json -m all-distilroberta-v1
    3 4  # topicmodeling -i mixed.json -t json -s -v -m all-distilroberta-v1
    4 5  # topicmodeling -i mixed.json -t json -s -m all-distilroberta-v1
    skipped 2 lines
    7 8  # topicmodeling -i testdataset.csv -t csv -s -v -m all-mpnet-base-v2
    8 9  # topicmodeling -i testdataset.csv -t csv -s -m all-mpnet-base-v2
    9 10  # topicmodeling -i testdataset.csv -t csv -v -m all-mpnet-base-v2
     11 +# # TESTING The Reporting & API Function =
     12 +# topicmodeling -i mixed.json -t json -m all-distilroberta-v1 --output
     13 +# report json testreport iris/topicmodeling
     14 +# topicmodeling -i mixed.json -t json -m all-distilroberta-v1 --api
     15 +# # TESTING The Function: Search Topics Using a Keyword (Top2Vec) =
     16 +# topicmodeling -i mixed.json -t json -m all-distilroberta-v1 -k music
    10 17  # Note: download the dataset for testing from https://github.com/keamanansiber/Maryam/tree/master/notebooks
     18 +# Hatma Suryotrisongko
    11 19   
    12 20   
    13 21  meta = {
    skipped 1 lines
    15 23   'author': 'Hatma Suryotrisongko',
    16 24   'version': '0.1.0',
    17 25   'description': 'Topic Modeling Algorithms.',
    18  - 'required': ('dask', 'scikit-learn', 'umap', 'bertopic', 'gensim'),
     26 + 'required': ('dask', 'scikit-learn', 'umap', 'bertopic', 'gensim', 'top2vec'),
    19 27   'options': (
    20 28   ('inputfile', None, True, 'Input file that contains the data', '-i', 'store', str),
    21 29   ('filetype', None, True, 'File type: csv/json', '-t', 'store', str),
     30 + ('keyword', None, False, 'Search keyword: ', '-k', 'store', str),
    22 31   ('showcharts', None, False, 'Show charts?', '-s', 'store_true', bool),
    23 32   ('verbose', None, False, 'Verbose output?', '-v', 'store_true', bool),
    24 33   ('pretrained_model', None, True, 'model for embedding', '-m', 'store', str),
    25 34   ),
    26  - 'examples': ('topicmodeling -i mixed.json -t json -s True -v False -m all-mpnet-base-v2')
     35 + 'examples': ('topicmodeling -i mixed.json -t json -k music -s True -v False -m all-mpnet-base-v2')
    27 36  }
    28 37   
    29 38   
    30 39  def module_api(self):
    31 40   
    32  - run = self.topic(self.options['inputfile'], self.options['filetype'], self.options['showcharts'], self.options['verbose'])
     41 + run = self.topic(self.options['inputfile'], self.options['filetype'], self.options['keyword'], self.options['showcharts'], self.options['verbose'])
    33 42   run.run_sklearn_cluster_kmeans(self.options['pretrained_model'], self.options['showcharts'], self.options['verbose'])
    34 43   
    35 44   results = run.run_topic_modeling_bertopic(self.options['pretrained_model'], self.options['verbose'])
    skipped 6 lines
    42 51   
    43 52   inputfile = self.options['inputfile']
    44 53   self.save_gather(output, 'iris/topicmodeling', inputfile, output=self.options['output'])
     54 + 
     55 + if self.options['keyword'] is not None:
     56 + print(" keyword = " + self.options['keyword'])
     57 + run.run_search_topics_top2vec(self.options['keyword'], self.options['showcharts'], self.options['verbose'])
    45 58   
    46 59   return output
    47 60   
    skipped 3 lines
Please wait...
Page is in error, reload to recover