■ ■ ■ ■ ■ ■
maryam/modules/iris/topicmodeling.py
1 - # TESTED USING = 1 + # modules/iris/topicmodeling.py 2 + # # TESTING The Main Function = 2 3 # topicmodeling -i mixed.json -t json -m all-distilroberta-v1 3 4 # topicmodeling -i mixed.json -t json -s -v -m all-distilroberta-v1 4 5 # topicmodeling -i mixed.json -t json -s -m all-distilroberta-v1 skipped 2 lines 7 8 # topicmodeling -i testdataset.csv -t csv -s -v -m all-mpnet-base-v2 8 9 # topicmodeling -i testdataset.csv -t csv -s -m all-mpnet-base-v2 9 10 # topicmodeling -i testdataset.csv -t csv -v -m all-mpnet-base-v2 11 + # # TESTING The Reporting & API Function = 12 + # topicmodeling -i mixed.json -t json -m all-distilroberta-v1 --output 13 + # report json testreport iris/topicmodeling 14 + # topicmodeling -i mixed.json -t json -m all-distilroberta-v1 --api 15 + # # TESTING The Function: Search Topics Using a Keyword (Top2Vec) = 16 + # topicmodeling -i mixed.json -t json -m all-distilroberta-v1 -k music 10 17 # Note: download the dataset for testing from https://github.com/keamanansiber/Maryam/tree/master/notebooks 18 + # Hatma Suryotrisongko 11 19 12 20 13 21 meta = { skipped 1 lines 15 23 'author': 'Hatma Suryotrisongko', 16 24 'version': '0.1.0', 17 25 'description': 'Topic Modeling Algorithms.', 18 - 'required': ('dask', 'scikit-learn', 'umap', 'bertopic', 'gensim'), 26 + 'required': ('dask', 'scikit-learn', 'umap', 'bertopic', 'gensim', 'top2vec'), 19 27 'options': ( 20 28 ('inputfile', None, True, 'Input file that contains the data', '-i', 'store', str), 21 29 ('filetype', None, True, 'File type: csv/json', '-t', 'store', str), 30 + ('keyword', None, False, 'Search keyword: ', '-k', 'store', str), 22 31 ('showcharts', None, False, 'Show charts?', '-s', 'store_true', bool), 23 32 ('verbose', None, False, 'Verbose output?', '-v', 'store_true', bool), 24 33 ('pretrained_model', None, True, 'model for embedding', '-m', 'store', str), 25 34 ), 26 - 'examples': ('topicmodeling -i mixed.json -t json -s True -v False -m all-mpnet-base-v2') 35 + 'examples': 
('topicmodeling -i mixed.json -t json -k music -s True -v False -m all-mpnet-base-v2') 27 36 } 28 37 29 38 30 39 def module_api(self): 31 40 32 - run = self.topic(self.options['inputfile'], self.options['filetype'], self.options['showcharts'], self.options['verbose']) 41 + run = self.topic(self.options['inputfile'], self.options['filetype'], self.options['keyword'], self.options['showcharts'], self.options['verbose']) 33 42 run.run_sklearn_cluster_kmeans(self.options['pretrained_model'], self.options['showcharts'], self.options['verbose']) 34 43 35 44 results = run.run_topic_modeling_bertopic(self.options['pretrained_model'], self.options['verbose']) skipped 6 lines 42 51 43 52 inputfile = self.options['inputfile'] 44 53 self.save_gather(output, 'iris/topicmodeling', inputfile, output=self.options['output']) 54 + 55 + if self.options['keyword'] is not None: 56 + print(" keyword = " + self.options['keyword']) 57 + run.run_search_topics_top2vec(self.options['keyword'], self.options['showcharts'], self.options['verbose']) 45 58 46 59 return output 47 60 skipped 3 lines