STRLCPY/Maryam

Adding Function: Search Topics Using a Keyword (Top2Vec)
Hatma Suryotrisongko committed 2 years ago

241d5baf

1 parent f46565d6

■ ■ ■ ■ ■ ■ ■

maryam/core/util/iris/topic.py

1		-	# core/util/iris/topicmodeling.py
2		-	# Based on Hatma Suryotrisongko's prototype = https://github.com/keamanansiber/Maryam/blob/master/notebooks/Prototype_4_TopicModeling_0_1_0_CsvFile_Options_StopwordsRemoval_27062022.ipynb
	1	+	# core/util/iris/topic.py
	2	+	# Hatma Suryotrisongko
3	3
4	4		import pandas as pd
5	5		import numpy as np
		skipped 11 lines
17	17
18	18		from gensim.parsing.preprocessing import remove_stopwords, STOPWORDS
19	19
	20	+	from top2vec import Top2Vec
20	21
21	22		class main:
22	23
23		-	def __init__(self, inputfile, filetype, showcharts, verbose):
	24	+	def __init__(self, inputfile, filetype, keyword, showcharts, verbose):
24	25
25	26		if verbose == True:
26	27		print("\n\n DATASET = reading file : " + inputfile)
	28	+	print("\n\n Search keyword = " + keyword)
27	29
28	30		if filetype == "csv":
29	31		# tmp = pd.read_csv(inputfile, header=None, low_memory=False)
		skipped 145 lines
175	177		return output
176	178
177	179
	def run_search_topics_top2vec(self, keyword, showcharts, verbose):
		"""Train a Top2Vec model on the corpus and print keyword search results.

		Prints, in order: the first (at most five) topics found by the model,
		the documents that best match ``keyword``, optionally one word cloud
		per topic matching ``keyword``, and the 20 words most similar to
		``keyword``. Nothing is returned; all results go to stdout.

		Args:
			keyword: Search term; it is concatenated into the printed
				messages, so it must be a string.
			showcharts: When truthy, generate a word cloud for every topic
				returned by the keyword topic search.
			verbose: When truthy, also print the embedding model name and
				the trained model object.
		"""

		print("\n\n Search Topics Using Top2Vec (caution: might not work well for a small dataset)")
		print("\n the Search Keyword = " + keyword)

		# NOTE(review): this name is only printed; it is not passed to the
		# Top2Vec constructor below. Confirm whether
		# embedding_model=pretrained_embedding_model was intended.
		pretrained_embedding_model = "universal-sentence-encoder-multilingual"
		if verbose:
			print("\n\n Pretrained Embedding Model")
			# https://tfhub.dev/google/universal-sentence-encoder-multilingual/
			# 16 languages (Arabic, Chinese-simplified, Chinese-traditional, English, French, German, Italian, Japanese, Korean, Dutch, Polish, Portuguese, Spanish, Thai, Turkish, Russian) text encoder.
			print(pretrained_embedding_model)

		documents = self.corpus.tolist()
		model = Top2Vec(documents=documents, speed="learn", workers=8)
		if verbose:
			print("\n Model = ")
			print(model)

		# Show at most five topics, or fewer when the model found fewer.
		ntopics = min(5, model.get_num_topics())

		topic_words, word_scores, topic_nums = model.get_topics(ntopics)
		print(topic_words)
		print(word_scores)
		print(topic_nums)

		print("\n Semantic Search Documents by Keywords = ")
		# Never ask for more documents than the corpus holds; Top2Vec rejects
		# a num_docs larger than the number of trained documents.
		num_docs = min(5, len(documents))
		found_docs, found_scores, found_ids = model.search_documents_by_keywords(keywords=[keyword], num_docs=num_docs)
		for doc, score, doc_id in zip(found_docs, found_scores, found_ids):
			print(f"Document: {doc_id}, Score: {score}")
			print("-----------")
			print(doc)
			print("-----------")
			print()

		if showcharts:
			print("\n\n Generate Word Clouds = ")
			# Only the topic numbers are needed to draw the word clouds.
			_, _, _, matched_topic_nums = model.search_topics(keywords=[keyword], num_topics=ntopics)
			for topic in matched_topic_nums:
				model.generate_topic_wordcloud(topic)

		print("\n Similar Keywords = ")
		words, similarity_scores = model.similar_words(keywords=[keyword], keywords_neg=[], num_words=20)
		for word, score in zip(words, similarity_scores):
			print(f"{word} {score}")
	226	+

■ ■ ■ ■ ■ ■

maryam/modules/iris/topicmodeling.py

1		-	# TESTED USING =
	1	+	# modules/iris/topicmodeling.py
	2	+	# # TESTING The Main Function =
2	3		# topicmodeling -i mixed.json -t json -m all-distilroberta-v1
3	4		# topicmodeling -i mixed.json -t json -s -v -m all-distilroberta-v1
4	5		# topicmodeling -i mixed.json -t json -s -m all-distilroberta-v1
		skipped 2 lines
7	8		# topicmodeling -i testdataset.csv -t csv -s -v -m all-mpnet-base-v2
8	9		# topicmodeling -i testdataset.csv -t csv -s -m all-mpnet-base-v2
9	10		# topicmodeling -i testdataset.csv -t csv -v -m all-mpnet-base-v2
	11	+	# # TESTING The Reporting & API Function =
	12	+	# topicmodeling -i mixed.json -t json -m all-distilroberta-v1 --output
	13	+	# report json testreport iris/topicmodeling
	14	+	# topicmodeling -i mixed.json -t json -m all-distilroberta-v1 --api
	15	+	# # TESTING The Function: Search Topics Using a Keyword (Top2Vec) =
	16	+	# topicmodeling -i mixed.json -t json -m all-distilroberta-v1 -k music
10	17		# Note: download the dataset for testing from https://github.com/keamanansiber/Maryam/tree/master/notebooks
	18	+	# Hatma Suryotrisongko
11	19
12	20
13	21		meta = {
		skipped 1 lines
15	23		'author': 'Hatma Suryotrisongko',
16	24		'version': '0.1.0',
17	25		'description': 'Topic Modeling Algorithms.',
18		-	'required': ('dask', 'scikit-learn', 'umap', 'bertopic', 'gensim'),
	26	+	'required': ('dask', 'scikit-learn', 'umap', 'bertopic', 'gensim', 'top2vec'),
19	27		'options': (
20	28		('inputfile', None, True, 'Input file that contains the data', '-i', 'store', str),
21	29		('filetype', None, True, 'File type: csv/json', '-t', 'store', str),
	30	+	('keyword', None, False, 'Search keyword: ', '-k', 'store', str),
22	31		('showcharts', None, False, 'Show charts?', '-s', 'store_true', bool),
23	32		('verbose', None, False, 'Verbose output?', '-v', 'store_true', bool),
24	33		('pretrained_model', None, True, 'model for embedding', '-m', 'store', str),
25	34		),
26		-	'examples': ('topicmodeling -i mixed.json -t json -s True -v False -m all-mpnet-base-v2')
	35	+	'examples': ('topicmodeling -i mixed.json -t json -k music -s True -v False -m all-mpnet-base-v2')
27	36		}
28	37
29	38
30	39		def module_api(self):
31	40
32		-	run = self.topic(self.options['inputfile'], self.options['filetype'], self.options['showcharts'], self.options['verbose'])
	41	+	run = self.topic(self.options['inputfile'], self.options['filetype'], self.options['keyword'], self.options['showcharts'], self.options['verbose'])
33	42		run.run_sklearn_cluster_kmeans(self.options['pretrained_model'], self.options['showcharts'], self.options['verbose'])
34	43
35	44		results = run.run_topic_modeling_bertopic(self.options['pretrained_model'], self.options['verbose'])
		skipped 6 lines
42	51
43	52		inputfile = self.options['inputfile']
44	53		self.save_gather(output, 'iris/topicmodeling', inputfile, output=self.options['output'])
	54	+
	55	+	if self.options['keyword'] is not None:
	56	+	print(" keyword = " + self.options['keyword'])
	57	+	run.run_search_topics_top2vec(self.options['keyword'], self.options['showcharts'], self.options['verbose'])
45	58
46	59		return output
47	60
		skipped 3 lines

Adding Function: Search Topics Using a Keyword (Top2Vec)