STRLCPY/Maryam

The initial version of the Topic Modeling modules (v0.1.0), contributed by Hatma Suryotrisongko, consists of: - maryam/core/util/iris/topic.py - maryam/modules/iris/topicmodeling.py
[email protected] committed 2 years ago

314a2fbc

1 parent 5f2779b0

■ ■ ■ ■ ■ ■

maryam/core/util/iris/topic.py

1	+	# core/util/iris/topic.py
2	+	# Based on Hatma Suryotrisongko's prototype = https://github.com/keamanansiber/Maryam/blob/master/notebooks/Prototype_4_TopicModeling_0_1_0_CsvFile_Options_StopwordsRemoval_27062022.ipynb
3	+
4	+	import pandas as pd
5	+	import numpy as np
6	+	import json
7	+	import csv
8	+	from dask import dataframe as dd
9	+
10	+	from sklearn.cluster import KMeans
11	+	import scipy
12	+	import matplotlib.pyplot as plt
13	+	import umap
14	+
15	+	from bertopic import BERTopic
16	+	from sentence_transformers import SentenceTransformer
17	+
18	+	from gensim.parsing.preprocessing import remove_stopwords, STOPWORDS
19	+
20	+
class main:
    """Load a text corpus from a CSV/JSON file and cluster it by topic.

    The constructor reads the file, lower-cases every document, removes
    stopwords and stores the result in ``self.corpus`` (a NumPy array of
    strings).  Two analyses can then be run on that corpus: K-Means over
    sentence embeddings (``run_sklearn_cluster_kmeans``) and BERTopic
    (``run_topic_modeling_bertopic``).  Both report their results on stdout
    and return ``None``.
    """

    # File types the constructor knows how to parse.
    SUPPORTED_FILETYPES = ("csv", "json")

    def __init__(self, inputfile, filetype, showcharts, verbose):
        """Read ``inputfile`` and build the pre-processed corpus.

        Args:
            inputfile: Path of the dataset to read.
            filetype: ``"csv"`` (``;``-separated, no header row; the text is
                taken from the first column) or ``"json"`` (an object whose
                ``results`` entries carry ``t`` and ``d`` string fields, which
                are joined with a single space).
            showcharts: When true, plot a histogram of words per document.
            verbose: When true, print the raw data and the resulting corpus.

        Raises:
            ValueError: If ``filetype`` is neither ``"csv"`` nor ``"json"``.
        """
        # Fail early: without a corpus every later step would only die with
        # an AttributeError on ``self.corpus``.
        if filetype not in self.SUPPORTED_FILETYPES:
            raise ValueError("ERROR, only accept csv or json file!")

        if verbose:
            print("\n\n DATASET = reading file : " + inputfile)

        if filetype == "csv":
            # The file is read through dask and then materialised into a
            # plain pandas DataFrame with positional column labels.
            lazy_frame = dd.read_csv(inputfile, sep=';', header=None)
            raw = pd.DataFrame(lazy_frame.to_dask_array(lengths=True).compute())

            if verbose:
                print("\n\n csv file (before preprocessing) = ")
                print(raw)

            documents = raw[0]
        else:
            with open(inputfile, encoding="utf-8") as json_file:
                payload = json.load(json_file)

            raw = pd.DataFrame(payload['results'])

            if verbose:
                print("\n\n json file (before preprocessing) = ")
                print(raw)

            # Each document is the concatenation of the 't' and 'd' fields.
            raw['td'] = raw['t'] + ' ' + raw['d']
            documents = raw['td']

        self.corpus = documents.str.lower().apply(remove_stopwords).to_numpy()

        if verbose:
            print("\n\n number of corpus = ")
            print(len(self.corpus))
            print("\n\n self.corpus[0] = ")
            print(self.corpus[0])
            print("\n\n all self.corpus = ")
            print(self.corpus)

        if showcharts:
            print("\n\n histogram of the number of words in each corpus")
            pd.Series([len(e.split()) for e in self.corpus]).hist()
            plt.show()

    def _embed_corpus(self, pretrained_model, verbose):
        """Load the sentence-transformer model and encode ``self.corpus``.

        Returns:
            A ``(model, embeddings)`` tuple so callers can reuse both without
            loading the model or encoding the corpus a second time.
        """
        if verbose:
            print("\n\n Model selection")
            # https://www.sbert.net/docs/pretrained_models.html
            print(pretrained_model)

        model = SentenceTransformer(pretrained_model)
        if verbose:
            print(model)

        corpus_embeddings = model.encode(self.corpus)
        if verbose:
            print("\n\n CORPUS EMBEDDING")
            print(corpus_embeddings.shape)
            print(corpus_embeddings)

        return model, corpus_embeddings

    @staticmethod
    def _closest_to_centers(distances):
        """Map each cluster index to the index of its nearest document.

        ``distances`` holds one row per cluster centre and one column per
        document; the smallest entry of a row identifies the document that
        lies closest to that centre.
        """
        return {cluster: int(np.argmin(row)) for cluster, row in enumerate(distances)}

    def run_sklearn_cluster_kmeans(self, selected_pretrained_model, showcharts, verbose, n_clusters=5):
        """Cluster the corpus embeddings with K-Means and print the clusters.

        Args:
            selected_pretrained_model: Name of the sentence-transformers
                model used to embed the corpus.
            showcharts: When true, draw a 2-D UMAP projection of the clusters.
            verbose: When true, print intermediate results.
            n_clusters: Requested number of clusters (default 5).  It is
                capped at the corpus size because K-Means cannot form more
                clusters than there are documents.
        """
        # Imported explicitly: ``import scipy`` alone does not guarantee that
        # the ``scipy.spatial`` subpackage has been loaded.
        from scipy.spatial.distance import cdist

        _, corpus_embeddings = self._embed_corpus(selected_pretrained_model, verbose)

        n_clusters = min(n_clusters, len(self.corpus))
        kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(corpus_embeddings)
        if verbose:
            print("\n\n Show Cluster using SkLearn KMeans")
            print(kmeans)

        corpus_labeled = pd.DataFrame({'ClusterLabel': kmeans.labels_, 'Sentence': self.corpus})
        print("\n\n corpus_labeled = ")
        print(corpus_labeled)

        cls_dist = pd.Series(kmeans.labels_).value_counts()
        if verbose:
            print("\n\n frequency of cluster label = ")
            print(cls_dist)

        distances = cdist(kmeans.cluster_centers_, corpus_embeddings)
        if verbose:
            print("\n\n calculate distance of cluster's center point = ")
            print(distances)

        print("\n\n Cluster's center example = ")

        centers = self._closest_to_centers(distances)
        print("Cluster", "Size", "Center-idx", "Center-Example", sep="\t\t")
        for cluster, doc_index in centers.items():
            print(cluster, cls_dist.get(cluster, 0), doc_index, self.corpus[doc_index], sep="\t\t")

        if showcharts:
            print("\n\n Visualization of the cluster points")

            projected = umap.UMAP(n_components=2, min_dist=0.0).fit_transform(corpus_embeddings)

            plt.subplots(figsize=(12, 8))
            plt.scatter(projected[:, 0], projected[:, 1], c=kmeans.labels_, s=1, cmap='Paired')
            # Label every cluster at the position of its most central document.
            for cluster, doc_index in centers.items():
                plt.text(projected[doc_index, 0], projected[doc_index, 1], "CLS-" + str(cluster), fontsize=24)
            plt.colorbar()
            plt.show()

    def run_topic_modeling_bertopic(self, selected_pretrained_model, verbose):
        """Fit BERTopic on the corpus and print the words of every topic.

        Args:
            selected_pretrained_model: Name of the sentence-transformers
                model used to embed the corpus.
            verbose: When true, print intermediate results.
        """
        sentence_model, corpus_embeddings = self._embed_corpus(selected_pretrained_model, verbose)

        print("\n\n Topic Modeling with BERTopic")

        topic_model = BERTopic(embedding_model=sentence_model)
        if verbose:
            print(topic_model)

        # Hand over the embeddings computed above so the corpus is not
        # encoded a second time inside BERTopic.
        topics, _ = topic_model.fit_transform(self.corpus, embeddings=corpus_embeddings)
        topic_info = topic_model.get_topic_info()
        print(topic_info[:6])

        corpus_labeled = pd.DataFrame({'ClusterLabel': topics, 'Sentence': self.corpus})
        if verbose:
            print("\n\n corpus_labeled = ")
            print(corpus_labeled)

        print("\n\n topics for each cluster = ")

        # Use the real topic ids: BERTopic numbers the outlier topic -1, so
        # counting from 0 to len(topic_info) - 1 would skip it and ask for
        # one id that does not exist.
        for topic_id in topic_info['Topic']:
            topic_id = int(topic_id)
            print("Cluster #" + str(topic_id) + " = ")
            print(topic_model.get_topic(topic_id))
173	+
174	+

■ ■ ■ ■ ■ ■

maryam/modules/iris/topicmodeling.py

1	+	# TESTED USING =
2	+	# topicmodeling -i mixed.json -t json -m all-distilroberta-v1
3	+	# topicmodeling -i mixed.json -t json -s -v -m all-distilroberta-v1
4	+	# topicmodeling -i mixed.json -t json -s -m all-distilroberta-v1
5	+	# topicmodeling -i mixed.json -t json -v -m all-distilroberta-v1
6	+	# topicmodeling -i testdataset.csv -t csv -m all-mpnet-base-v2
7	+	# topicmodeling -i testdataset.csv -t csv -s -v -m all-mpnet-base-v2
8	+	# topicmodeling -i testdataset.csv -t csv -s -m all-mpnet-base-v2
9	+	# topicmodeling -i testdataset.csv -t csv -v -m all-mpnet-base-v2
10	+	# Note: download the dataset for testing from https://github.com/keamanansiber/Maryam/tree/master/notebooks
11	+
12	+
# Module metadata: display name, author, version, description, the
# command-line options and usage examples.
# NOTE(review): each option tuple looks like
# (name, default, required, description, flag, action, type) -- confirm
# against the framework's option parser.
meta = {
    'name': 'Topic Modeling',
    'author': 'Hatma Suryotrisongko',
    'version': '0.1.0',
    'description': 'Topic Modeling Algorithms.',
    'options': (
        ('inputfile', None, True, 'Input file that contains the data', '-i', 'store', str),
        ('filetype', None, True, 'File type: csv/json', '-t', 'store', str),
        ('showcharts', None, False, 'Show charts?', '-s', 'store_true', bool),
        ('verbose', None, False, 'Verbose output?', '-v', 'store_true', bool),
        ('pretrained_model', None, True, 'model for embedding', '-m', 'store', str),
    ),
    # The trailing comma keeps this a one-element tuple rather than a bare
    # string.  '-s' and '-v' are store_true switches, so they take no value.
    'examples': ('topicmodeling -i mixed.json -t json -s -m all-mpnet-base-v2',),
}
27	+
28	+
def module_api(self):
    """Run the K-Means and BERTopic analyses on the configured input file.

    Builds the topic helper via ``self.topic`` from the ``inputfile``,
    ``filetype``, ``showcharts`` and ``verbose`` options, then runs both
    analyses with the ``pretrained_model`` option.  Returns ``None``.
    """
    opts = self.options
    analysis = self.topic(
        opts['inputfile'],
        opts['filetype'],
        opts['showcharts'],
        opts['verbose'],
    )

    model_name = opts['pretrained_model']
    analysis.run_sklearn_cluster_kmeans(model_name, opts['showcharts'], opts['verbose'])
    analysis.run_topic_modeling_bertopic(opts['pretrained_model'], opts['verbose'])
34	+
35	+
def module_run(self):
    """Run the module by delegating to ``module_api``; returns ``None``."""
    module_api(self)

The initial version of the Topic Modeling modules (v0.1.0), contributed by Hatma Suryotrisongko, consists of: - maryam/core/util/iris/topic.py - maryam/modules/iris/topicmodeling.py