STRLCPY/Maryam

Merge pull request #269 from keamanansiber/TopicModeling_0.1.0
```
The initial version of the Topic Modeling modules (v0.1.0)
```
Saeed Dehqan committed with GitHub 2 years ago

fb460ae2

2 parents
d6168581
7e0c987b

Revision indexing in progress... (symbol navigation in revisions will be accurate after indexed)

■ ■ ■ ■ ■ ■

maryam/core/util/iris/plotly.py maryam/core/util/iris/plotlyutil.py

		skipped 11 lines
12	12		along with this program. If not, see <http://www.gnu.org/licenses/>.
13	13		"""
14	14
15		-	import plotly
	15	+	import plotlyutil
16	16
17	17		class main:
18	18
		skipped 20 lines

■ ■ ■ ■ ■ ■

maryam/core/util/iris/topic.py

1	+	# core/util/iris/topic.py
2	+	# Hatma Suryotrisongko
3	+
4	+	import pandas as pd
5	+	import numpy as np
6	+	import matplotlib.pyplot as plt
7	+	from sentence_transformers import SentenceTransformer
8	+
class main:
    """Load a text corpus from a CSV or JSON file and run topic-modeling routines on it.

    The preprocessed documents (lower-cased, stop words removed) are stored in
    ``self.corpus`` as a NumPy array; every ``run_*`` method works on that array.
    """

    def __init__(self, inputfile, filetype, keyword, showcharts, verbose):
        """Read ``inputfile`` and build ``self.corpus``.

        Args:
            inputfile: Path of the dataset file.
            filetype: ``"csv"`` (``;``-separated, no header row, text taken from
                the first column) or ``"json"`` (a top-level ``results`` entry
                whose records have ``t`` and ``d`` fields, joined with a space).
            keyword: Search keyword; only echoed here in verbose mode. May be ``None``.
            showcharts: When true, show a histogram of words per document.
            verbose: When true, print intermediate data.

        Raises:
            ValueError: If ``filetype`` is neither ``"csv"`` nor ``"json"``.
        """
        # Fail fast (and before importing heavy optional dependencies): without
        # a supported file type there is no corpus and every later call would
        # break on a missing ``self.corpus``.
        if filetype not in ("csv", "json"):
            raise ValueError("ERROR, only accept csv or json file!")

        from gensim.parsing.preprocessing import remove_stopwords

        if verbose:
            print("\n\n DATASET = reading file : " + inputfile)
            # f-string: keyword is optional and may be None.
            print(f"\n\n Search keyword = {keyword}")

        if filetype == "csv":
            from dask import dataframe as dd

            lazy_frame = dd.read_csv(inputfile, sep=';', header=None)
            frame = pd.DataFrame(lazy_frame.to_dask_array(lengths=True).compute())

            if verbose:
                print("\n\n csv file (before preprocessing) = ")
                print(frame)

            texts = frame[0]
        else:
            import json

            with open(inputfile, encoding="utf-8") as json_file:
                payload = json.load(json_file)

            frame = pd.DataFrame(payload['results'])

            if verbose:
                print("\n\n json file (before preprocessing) = ")
                print(frame)

            # presumably 't' is a title and 'd' a description — TODO confirm
            frame['td'] = frame['t'] + ' ' + frame['d']
            texts = frame['td']

        self.corpus = texts.str.lower().apply(remove_stopwords).to_numpy()

        if verbose:
            print("\n\n number of corpus = ")
            print(len(self.corpus))
            if len(self.corpus) > 0:
                print("\n\n self.corpus[0] = ")
                print(self.corpus[0])
            print("\n\n all self.corpus = ")
            print(self.corpus)

        if showcharts:
            print("\n\n histogram of the number of words in each corpus")
            pd.Series([len(doc.split()) for doc in self.corpus]).hist()
            plt.show()

    def run_sklearn_cluster_kmeans(self, selected_pretrained_model, showcharts, verbose, n_clusters=5):
        """Embed the corpus and cluster the embeddings with scikit-learn KMeans.

        Prints the per-document cluster labels and, for every cluster, its size
        and the document closest to the cluster centre.

        Args:
            selected_pretrained_model: Name passed to ``SentenceTransformer``.
            showcharts: When true, show a 2-D UMAP scatter plot of the clusters.
            verbose: When true, print intermediate data.
            n_clusters: Requested number of clusters (default 5, the value that
                used to be hard-coded); capped at the number of documents.
        """
        from scipy.spatial.distance import cdist
        from sklearn.cluster import KMeans

        if verbose:
            print("\n\n Model selection")
            # https://www.sbert.net/docs/pretrained_models.html
            print(selected_pretrained_model)

        model = SentenceTransformer(selected_pretrained_model)
        if verbose:
            print(model)

        corpus_embeddings = model.encode(self.corpus)
        if verbose:
            print("\n\n CORPUS EMBEDDING")
            print(corpus_embeddings.shape)
            print(corpus_embeddings)

        # KMeans rejects more clusters than samples, so cap the request.
        n_clusters = min(n_clusters, len(self.corpus))
        kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(corpus_embeddings)
        if verbose:
            print("\n\n Show Cluster using SkLearn KMeans")
            print(kmeans)

        corpus_labeled = pd.DataFrame({'ClusterLabel': kmeans.labels_, 'Sentence': self.corpus})
        print("\n\n corpus_labeled = ")
        print(corpus_labeled)

        cls_dist = pd.Series(kmeans.labels_).value_counts()
        if verbose:
            print("\n\n frequency of cluster label = ")
            print(cls_dist)

        # Row i holds the distance from centre i to every document embedding.
        distances = cdist(kmeans.cluster_centers_, corpus_embeddings)
        if verbose:
            print("\n\n calculate distance of cluster's center point = ")
            print(distances)

        print("\n\n Cluster's center example = ")

        centers = {}
        print("Cluster", "Size", "Center-idx", "Center-Example", sep="\t\t")
        for cluster_id, row in enumerate(distances):
            nearest = np.argmin(row)
            centers[cluster_id] = nearest
            print(cluster_id, cls_dist.get(cluster_id, 0), nearest, self.corpus[nearest], sep="\t\t")

        if showcharts:
            import umap

            print("\n\n Visualization of the cluster points")

            points = umap.UMAP(n_components=2, min_dist=0.0).fit_transform(corpus_embeddings)

            plt.subplots(figsize=(12, 8))
            plt.scatter(points[:, 0], points[:, 1], c=kmeans.labels_, s=1, cmap='Paired')
            for cluster_id, nearest in centers.items():
                plt.text(points[nearest, 0], points[nearest, 1], "CLS-" + str(cluster_id), fontsize=24)
            plt.colorbar()
            plt.show()

    def run_topic_modeling_bertopic(self, selected_pretrained_model, verbose):
        """Fit a BERTopic model on the corpus and return its topic overview.

        Args:
            selected_pretrained_model: Name passed to ``SentenceTransformer``.
            verbose: When true, print intermediate data.

        Returns:
            The DataFrame produced by ``BERTopic.get_topic_info()``.
        """
        from bertopic import BERTopic

        if verbose:
            print("\n\n Model selection")
            # https://www.sbert.net/docs/pretrained_models.html
            print(selected_pretrained_model)

        # Load the sentence transformer once and reuse it for BERTopic.
        sentence_model = SentenceTransformer(selected_pretrained_model)
        if verbose:
            print(sentence_model)

        corpus_embeddings = sentence_model.encode(self.corpus)
        if verbose:
            print("\n\n CORPUS EMBEDDING")
            print(corpus_embeddings.shape)
            print(corpus_embeddings)

        print("\n\n Topic Modeling with BERTopic")

        topic_model = BERTopic(embedding_model=sentence_model)
        if verbose:
            print(topic_model)

        # Hand over the embeddings computed above so the corpus is not encoded twice.
        topics, _ = topic_model.fit_transform(list(self.corpus), embeddings=corpus_embeddings)
        output = topic_model.get_topic_info()
        print(output[:6])

        corpus_labeled = pd.DataFrame({'ClusterLabel': topics, 'Sentence': self.corpus})
        if verbose:
            print("\n\n corpus_labeled = ")
            print(corpus_labeled)

        print("\n\n topics for each cluster = ")

        # Use the real topic ids: BERTopic labels outliers as -1, so counting
        # from 0 to the number of rows skips -1 and asks for a missing topic.
        for topic_id in output['Topic']:
            print("Cluster #" + str(topic_id) + " = ")
            print(topic_model.get_topic(topic_id))

        return output

    def run_search_topics_top2vec(self, keyword, showcharts, verbose):
        """Train a Top2Vec model on the corpus and search it with ``keyword``.

        Prints up to five topics, the documents closest to the keyword and the
        twenty words most similar to it.

        Args:
            keyword: Single search keyword.
            showcharts: When true, draw word clouds for the matching topics.
            verbose: When true, print intermediate data.
        """
        from top2vec import Top2Vec

        print("\n\n Search Topics Using Top2Vec (caution: might not work well for a small dataset)")
        print("\n the Search Keyword = " + keyword)

        # NOTE(review): this name is only reported in verbose mode; it is not
        # passed to Top2Vec below, which therefore uses its default embedding.
        pretrained_embedding_model = "universal-sentence-encoder-multilingual"
        if verbose:
            print("\n\n Pretrained Embedding Model")
            # https://tfhub.dev/google/universal-sentence-encoder-multilingual/
            # 16 languages (Arabic, Chinese-simplified, Chinese-traditional, English, French, German, Italian, Japanese, Korean, Dutch, Polish, Portuguese, Spanish, Thai, Turkish, Russian) text encoder.
            print(pretrained_embedding_model)

        model = Top2Vec(documents=self.corpus.tolist(), speed="learn", workers=8)
        if verbose:
            print("\n Model = ")
            print(model)

        ntopics = min(model.get_num_topics(), 5)

        topic_words, topic_word_scores, topic_nums = model.get_topics(ntopics)
        print(topic_words)
        print(topic_word_scores)
        print(topic_nums)

        print("\n Semantic Search Documents by Keywords = ")
        # Never ask for more documents than the corpus holds.
        num_docs = min(5, len(self.corpus))
        documents, document_scores, document_ids = model.search_documents_by_keywords(keywords=[keyword], num_docs=num_docs)
        for doc, score, doc_id in zip(documents, document_scores, document_ids):
            print(f"Document: {doc_id}, Score: {score}")
            print("-----------")
            print(doc)
            print("-----------")
            print()

        if showcharts:
            print("\n\n Generate Word Clouds = ")
            _, _, _, matched_topic_nums = model.search_topics(keywords=[keyword], num_topics=ntopics)
            for topic in matched_topic_nums:
                model.generate_topic_wordcloud(topic)
            # Display the generated figures, as the other charts in this class do.
            plt.show()

        print("\n Similar Keywords = ")
        words, similarity_scores = model.similar_words(keywords=[keyword], keywords_neg=[], num_words=20)
        for word, score in zip(words, similarity_scores):
            print(f"{word} {score}")
225	+

■ ■ ■ ■ ■ ■

maryam/modules/iris/topicmodeling.py

1	+	# modules/iris/topicmodeling.py
2	+	# # TESTING The Main Function =
3	+	# topicmodeling -i mixed.json -t json -m all-distilroberta-v1
4	+	# topicmodeling -i mixed.json -t json -s -v -m all-distilroberta-v1
5	+	# topicmodeling -i mixed.json -t json -s -m all-distilroberta-v1
6	+	# topicmodeling -i mixed.json -t json -v -m all-distilroberta-v1
7	+	# topicmodeling -i testdataset.csv -t csv -m all-mpnet-base-v2
8	+	# topicmodeling -i testdataset.csv -t csv -s -v -m all-mpnet-base-v2
9	+	# topicmodeling -i testdataset.csv -t csv -s -m all-mpnet-base-v2
10	+	# topicmodeling -i testdataset.csv -t csv -v -m all-mpnet-base-v2
11	+	# # TESTING The Reporting & API Function =
12	+	# topicmodeling -i mixed.json -t json -m all-distilroberta-v1 --output
13	+	# report json testreport iris/topicmodeling
14	+	# topicmodeling -i mixed.json -t json -m all-distilroberta-v1 --api
15	+	# # TESTING The Function: Search Topics Using a Keyword (Top2Vec) =
16	+	# topicmodeling -i mixed.json -t json -m all-distilroberta-v1 -k music
17	+	# Note: download the dataset for testing from https://github.com/keamanansiber/Maryam/tree/master/notebooks
18	+	# Hatma Suryotrisongko
19	+
20	+
# Module descriptor read by the framework.
meta = {
    'name': 'Topic Modeling',
    'author': 'Hatma Suryotrisongko',
    'version': '0.1.0',
    'description': 'Topic Modeling Algorithms.',
    'required': ('dask', 'scikit-learn', 'umap', 'bertopic', 'gensim', 'top2vec'),
    # Each option presumably reads (name, default, required, description,
    # flag, argparse action, type) — confirm against the framework loader.
    'options': (
        ('inputfile', None, True, 'Input file that contains the data', '-i', 'store', str),
        ('filetype', None, True, 'File type: csv/json', '-t', 'store', str),
        ('keyword', None, False, 'Search keyword: ', '-k', 'store', str),
        ('showcharts', None, False, 'Show charts?', '-s', 'store_true', bool),
        ('verbose', None, False, 'Verbose output?', '-v', 'store_true', bool),
        ('pretrained_model', None, True, 'model for embedding', '-m', 'store', str),
    ),
    # The trailing comma makes this a one-element tuple instead of a bare
    # string. -s and -v are store_true flags, so the example passes them
    # without a value.
    'examples': ('topicmodeling -i mixed.json -t json -k music -s -m all-mpnet-base-v2',)
}
37	+
38	+
def module_api(self):
    """Run KMeans clustering and BERTopic on the input file and return the gathered result.

    The BERTopic topic table is serialised as JSON records under the
    ``'results'`` key and handed to ``self.save_gather``. When a keyword option
    is set, a Top2Vec keyword search is run as well.
    """
    opts = self.options
    show_charts = opts['showcharts']
    verbose = opts['verbose']
    model_name = opts['pretrained_model']

    run = self.topic(opts['inputfile'], opts['filetype'], opts['keyword'], show_charts, verbose)
    run.run_sklearn_cluster_kmeans(model_name, show_charts, verbose)

    results = run.run_topic_modeling_bertopic(model_name, verbose)
    self.output("\n\nResults = \n")
    self.output(results)

    output = {'results': results.to_json(orient="records")}
    self.save_gather(output, 'iris/topicmodeling', opts['inputfile'], output=opts['output'])

    keyword = opts['keyword']
    if keyword is not None:
        self.output(" keyword = " + keyword)
        run.run_search_topics_top2vec(keyword, show_charts, verbose)

    return output
58	+
59	+
def module_run(self):
    """Run the module via ``module_api`` and print what it returned."""
    gathered = module_api(self)
    self.output("\n\nOutput = \n")
    self.output(gathered)

■ ■ ■ ■ ■ ■

setup.py

		skipped 43 lines
44	44		'nltk',
45	45		'matplotlib',
46	46		'pandas',
47		-	'wordcloud'
	47	+	'wordcloud',
	48	+	'numpy',
	49	+	'dask',
	50	+	'scikit-learn',
	51	+	'scipy',
	52	+	'umap',
	53	+	'bertopic',
	54	+	'sentence_transformers',
	55	+	'gensim',
	56	+	'top2vec'
48	57		],
49	58		classifiers=[
50	59		'Programming Language :: Python :: 3.10',
		skipped 8 lines

Merge pull request #269 from keamanansiber/TopicModeling_0.1.0