Projects · Maryam · Commits · 314a2fbc
  • The initial version of the Topic Modeling modules (v0.1.0), contributed by Hatma Suryotrisongko, consists of:
    - maryam/core/util/iris/topic.py
    - maryam/modules/iris/topicmodeling.py

  • [email protected] committed 2 years ago
    314a2fbc
    1 parent 5f2779b0
  • ■ ■ ■ ■ ■ ■
    maryam/core/util/iris/topic.py
# core/util/iris/topic.py
# Based on Hatma Suryotrisongko's prototype: https://github.com/keamanansiber/Maryam/blob/master/notebooks/Prototype_4_TopicModeling_0_1_0_CsvFile_Options_StopwordsRemoval_27062022.ipynb

import pandas as pd
import numpy as np
import json
import csv
from dask import dataframe as dd

from sklearn.cluster import KMeans
import scipy.spatial  # explicit submodule import so scipy.spatial.distance is available
import matplotlib.pyplot as plt
import umap

from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

from gensim.parsing.preprocessing import remove_stopwords, STOPWORDS


class main:

    def __init__(self, inputfile, filetype, showcharts, verbose):

        if verbose:
            print("\n\n DATASET = reading file : " + inputfile)

        if filetype == "csv":
            # Read the semicolon-separated CSV lazily with dask, then materialize it as a pandas DataFrame
            # tmp = pd.read_csv(inputfile, header=None, low_memory=False)
            tmp = dd.read_csv(inputfile, sep=';', header=None)
            tmp2 = tmp.to_dask_array(lengths=True)
            tmp3 = tmp2.compute()
            tmp4 = pd.DataFrame(tmp3)

            if verbose:
                print("\n\n csv file (before preprocessing) = ")
                print(tmp4)

            # Lowercase the first column and remove stopwords
            self.corpus = tmp4[0].str.lower().apply(remove_stopwords).to_numpy()

        elif filetype == "json":
            with open(inputfile) as json_file:
                jsonfile = json.load(json_file)

            tmp = pd.DataFrame(jsonfile['results'])

            if verbose:
                print("\n\n json file (before preprocessing) = ")
                print(tmp)

            # Concatenate title ('t') and description ('d'), then lowercase and remove stopwords
            tmp['td'] = tmp['t'] + ' ' + tmp['d']
            self.corpus = tmp['td'].str.lower().apply(remove_stopwords).to_numpy()

        else:
            print("ERROR, only accept csv or json file!")
            self.corpus = np.array([])

        if verbose:
            print("\n\n number of corpus = ")
            print(len(self.corpus))
            print("\n\n self.corpus[0] = ")
            print(self.corpus[0])
            print("\n\n all self.corpus = ")
            print(self.corpus)

        if showcharts:
            print("\n\n histogram of the number of words in each corpus")
            pd.Series([len(e.split()) for e in self.corpus]).hist()
            plt.show()

    def run_sklearn_cluster_kmeans(self, selected_pretrained_model, showcharts, verbose):

        pretrained_model = selected_pretrained_model
        if verbose:
            print("\n\n Model selection")
            # https://www.sbert.net/docs/pretrained_models.html
            print(pretrained_model)

        model = SentenceTransformer(pretrained_model)
        if verbose:
            print(model)

        # Encode every document into a dense sentence embedding
        corpus_embeddings = model.encode(self.corpus)
        if verbose:
            print("\n\n CORPUS EMBEDDING")
            print(corpus_embeddings.shape)
            print(corpus_embeddings)

        # Cluster the embeddings into K groups
        K = 5
        kmeans = KMeans(n_clusters=K, random_state=0).fit(corpus_embeddings)
        if verbose:
            print("\n\n Show Cluster using SkLearn KMeans")
            print(kmeans)

        corpus_labeled = pd.DataFrame({'ClusterLabel': kmeans.labels_, 'Sentence': self.corpus})
        print("\n\n corpus_labeled = ")
        print(corpus_labeled)

        cls_dist = pd.Series(kmeans.labels_).value_counts()
        if verbose:
            print("\n\n frequency of cluster label = ")
            print(cls_dist)

        # Distance from each cluster centroid to every document embedding
        distances = scipy.spatial.distance.cdist(kmeans.cluster_centers_, corpus_embeddings)
        if verbose:
            print("\n\n calculate distance of cluster's center point = ")
            print(distances)

        print("\n\n Cluster's center example = ")

        # For each cluster, pick the document closest to the centroid as its representative
        centers = {}
        print("Cluster", "Size", "Center-idx", "Center-Example", sep="\t\t")
        for i, d in enumerate(distances):
            ind = np.argsort(d, axis=0)[0]
            centers[i] = ind
            print(i, cls_dist[i], ind, self.corpus[ind], sep="\t\t")

        if showcharts:
            print("\n\n Visualization of the cluster points")

            # Project the embeddings to 2D with UMAP for plotting
            X = umap.UMAP(n_components=2, min_dist=0.0).fit_transform(corpus_embeddings)
            labels = kmeans.labels_

            fig, ax = plt.subplots(figsize=(12, 8))
            plt.scatter(X[:, 0], X[:, 1], c=labels, s=1, cmap='Paired')
            for c in centers:
                plt.text(X[centers[c], 0], X[centers[c], 1], "CLS-" + str(c), fontsize=24)
            plt.colorbar()
            plt.show()

    def run_topic_modeling_bertopic(self, selected_pretrained_model, verbose):

        pretrained_model = selected_pretrained_model
        if verbose:
            print("\n\n Model selection")
            # https://www.sbert.net/docs/pretrained_models.html
            print(pretrained_model)

        model = SentenceTransformer(pretrained_model)
        if verbose:
            print(model)

        corpus_embeddings = model.encode(self.corpus)
        if verbose:
            print("\n\n CORPUS EMBEDDING")
            print(corpus_embeddings.shape)
            print(corpus_embeddings)

        print("\n\n Topic Modeling with BERTopic")

        sentence_model = SentenceTransformer(pretrained_model)
        if verbose:
            print(sentence_model)

        # Fit BERTopic on the corpus using the sentence-transformer embeddings
        topic_model = BERTopic(embedding_model=sentence_model)
        if verbose:
            print(topic_model)

        topics, _ = topic_model.fit_transform(self.corpus)
        print(topic_model.get_topic_info()[:6])

        corpus_labeled = pd.DataFrame({'ClusterLabel': topics, 'Sentence': self.corpus})
        if verbose:
            print("\n\n corpus_labeled = ")
            print(corpus_labeled)

        print("\n\n topics for each cluster = ")

        # Print the top words for every discovered topic
        i = 0
        while i < len(topic_model.get_topic_info()):
            print("Cluster #" + str(i) + " = ")
            print(topic_model.get_topic(i))
            i += 1
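For context, the json branch above expects a top-level "results" array whose entries carry a title under 't' and a description under 'd'. Below is a minimal sketch of driving the core class directly; the file name, model name, and import path are illustrative only (the import path simply mirrors the file location and may need adjusting to how Maryam is installed):

# mixed.json, illustrative shape only:
# {"results": [{"t": "example title", "d": "example description"}, ...]}

from maryam.core.util.iris.topic import main as topic_main  # assumed import path

run = topic_main('mixed.json', 'json', showcharts=False, verbose=True)
run.run_sklearn_cluster_kmeans('all-distilroberta-v1', showcharts=False, verbose=True)
run.run_topic_modeling_bertopic('all-distilroberta-v1', verbose=True)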
  • ■ ■ ■ ■ ■ ■
    maryam/modules/iris/topicmodeling.py
# TESTED USING =
# topicmodeling -i mixed.json -t json -m all-distilroberta-v1
# topicmodeling -i mixed.json -t json -s -v -m all-distilroberta-v1
# topicmodeling -i mixed.json -t json -s -m all-distilroberta-v1
# topicmodeling -i mixed.json -t json -v -m all-distilroberta-v1
# topicmodeling -i testdataset.csv -t csv -m all-mpnet-base-v2
# topicmodeling -i testdataset.csv -t csv -s -v -m all-mpnet-base-v2
# topicmodeling -i testdataset.csv -t csv -s -m all-mpnet-base-v2
# topicmodeling -i testdataset.csv -t csv -v -m all-mpnet-base-v2
# Note: download the datasets for testing from https://github.com/keamanansiber/Maryam/tree/master/notebooks


meta = {
    'name': 'Topic Modeling',
    'author': 'Hatma Suryotrisongko',
    'version': '0.1.0',
    'description': 'Topic Modeling Algorithms.',
    'options': (
        ('inputfile', None, True, 'Input file that contains the data', '-i', 'store', str),
        ('filetype', None, True, 'File type: csv/json', '-t', 'store', str),
        ('showcharts', None, False, 'Show charts?', '-s', 'store_true', bool),
        ('verbose', None, False, 'Verbose output?', '-v', 'store_true', bool),
        ('pretrained_model', None, True, 'Model for embedding', '-m', 'store', str),
    ),
    'examples': ('topicmodeling -i mixed.json -t json -s -v -m all-mpnet-base-v2')
}


def module_api(self):
    # Build the corpus via the core util (core/util/iris/topic.py), then run both analyses
    run = self.topic(self.options['inputfile'], self.options['filetype'], self.options['showcharts'], self.options['verbose'])
    run.run_sklearn_cluster_kmeans(self.options['pretrained_model'], self.options['showcharts'], self.options['verbose'])
    run.run_topic_modeling_bertopic(self.options['pretrained_model'], self.options['verbose'])


def module_run(self):
    module_api(self)
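As an illustration only (not part of this commit), the option tuples above map naturally onto a plain argparse parser, which can be handy for smoke-testing the same flags outside the Maryam console; the core-class import path is assumed from the file location, as in the sketch after the first file:

# Hypothetical standalone driver mirroring the module's options.
import argparse
from maryam.core.util.iris.topic import main as topic_main  # assumed import path

parser = argparse.ArgumentParser(prog='topicmodeling')
parser.add_argument('-i', '--inputfile', required=True, help='Input file that contains the data')
parser.add_argument('-t', '--filetype', required=True, choices=('csv', 'json'), help='File type: csv/json')
parser.add_argument('-s', '--showcharts', action='store_true', help='Show charts?')
parser.add_argument('-v', '--verbose', action='store_true', help='Verbose output?')
parser.add_argument('-m', '--pretrained_model', required=True, help='Model for embedding')
args = parser.parse_args()

run = topic_main(args.inputfile, args.filetype, args.showcharts, args.verbose)
run.run_sklearn_cluster_kmeans(args.pretrained_model, args.showcharts, args.verbose)
run.run_topic_modeling_bertopic(args.pretrained_model, args.verbose)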