■ ■ ■ ■ ■ ■
maryam/core/util/iris/topic.py
| 1 | + | # core/util/iris/topic.py |
| 2 | + | # Based on Hatma Suryotrisongko's prototype: https://github.com/keamanansiber/Maryam/blob/master/notebooks/Prototype_4_TopicModeling_0_1_0_CsvFile_Options_StopwordsRemoval_27062022.ipynb |
| 3 | + | |
| 4 | + | import pandas as pd |
| 5 | + | import numpy as np |
| 6 | + | import json |
| 7 | + | import csv |
| 8 | + | from dask import dataframe as dd |
| 9 | + | |
| 10 | + | from sklearn.cluster import KMeans |
| 11 | + | import scipy |
| 12 | + | import matplotlib.pyplot as plt |
| 13 | + | import umap |
| 14 | + | |
| 15 | + | from bertopic import BERTopic |
| 16 | + | from sentence_transformers import SentenceTransformer |
| 17 | + | |
| 18 | + | from gensim.parsing.preprocessing import remove_stopwords, STOPWORDS |
| 19 | + | |
| 20 | + | |
class main:
    """Load a text corpus and cluster it by sentence embeddings.

    The corpus is read from a CSV or JSON file, lower-cased and stripped of
    stopwords. It can then be clustered with scikit-learn KMeans
    (``run_sklearn_cluster_kmeans``) or modeled with BERTopic
    (``run_topic_modeling_bertopic``). Both methods report their results by
    printing and return ``None``.
    """

    def __init__(self, inputfile, filetype, showcharts, verbose):
        """Read ``inputfile`` and build the preprocessed corpus.

        Args:
            inputfile: Path of the file to read.
            filetype: ``"csv"`` (semicolon-separated, no header row, text
                taken from the first column) or ``"json"`` (an object whose
                ``results`` entries carry ``t`` and ``d`` fields, which are
                joined with a space).
            showcharts: When true, show a histogram of the number of words
                per document.
            verbose: When true, print the raw and preprocessed data.

        Raises:
            ValueError: If ``filetype`` is neither ``"csv"`` nor ``"json"``.
        """
        if verbose:
            print("\n\n DATASET = reading file : " + inputfile)

        if filetype == "csv":
            raw_text = self._read_csv_text(inputfile, verbose)
        elif filetype == "json":
            raw_text = self._read_json_text(inputfile, verbose)
        else:
            # Fail here instead of leaving the instance without a corpus,
            # which would only surface later as an AttributeError.
            raise ValueError(
                "ERROR, only accept csv or json file! "
                "(got filetype=" + repr(filetype) + ")"
            )

        # numpy array holding one preprocessed document per entry
        self.corpus = self._preprocess(raw_text)

        if verbose:
            print("\n\n number of corpus = ")
            print(len(self.corpus))
            print("\n\n self.corpus[0] = ")
            print(self.corpus[0])
            print("\n\n all self.corpus = ")
            print(self.corpus)

        if showcharts:
            print("\n\n histogram of the number of words in each corpus")
            pd.Series([len(doc.split()) for doc in self.corpus]).hist()
            plt.show()

    @staticmethod
    def _read_csv_text(inputfile, verbose):
        """Return the first column of a semicolon-separated, header-less CSV."""
        frame = pd.DataFrame(
            dd.read_csv(inputfile, sep=';', header=None)
            .to_dask_array(lengths=True)
            .compute()
        )
        if verbose:
            print("\n\n csv file (before preprocessing) = ")
            print(frame)
        return frame[0]

    @staticmethod
    def _read_json_text(inputfile, verbose):
        """Return the ``t`` and ``d`` fields of each ``results`` entry, joined."""
        # Binary mode lets the json module detect the UTF-8/16/32 encoding
        # itself instead of depending on the platform's default text encoding.
        with open(inputfile, "rb") as json_file:
            payload = json.load(json_file)

        frame = pd.DataFrame(payload['results'])
        if verbose:
            print("\n\n json file (before preprocessing) = ")
            print(frame)

        # Fill each field separately so one missing value does not blank
        # the whole joined text.
        titles = frame['t'].fillna('').astype(str)
        details = frame['d'].fillna('').astype(str)
        return titles + ' ' + details

    @staticmethod
    def _preprocess(text):
        """Lower-case a Series of documents and strip stopwords from each."""
        # Missing or non-string cells would break the string operations.
        cleaned = text.fillna('').astype(str).str.lower()
        return cleaned.apply(remove_stopwords).to_numpy()

    def _embed_corpus(self, selected_pretrained_model, verbose):
        """Load the sentence-transformer model and encode the corpus.

        Returns:
            A ``(model, embeddings)`` tuple.
        """
        if verbose:
            print("\n\n Model selection")
            # https://www.sbert.net/docs/pretrained_models.html
            print(selected_pretrained_model)

        model = SentenceTransformer(selected_pretrained_model)
        if verbose:
            print(model)

        corpus_embeddings = model.encode(self.corpus)
        if verbose:
            print("\n\n CORPUS EMBEDDING")
            print(corpus_embeddings.shape)
            print(corpus_embeddings)

        return model, corpus_embeddings

    @staticmethod
    def _closest_document_indices(cluster_centers, corpus_embeddings, verbose=False):
        """Map each cluster index to the index of the embedding nearest its center.

        Args:
            cluster_centers: 2-D array with one row per cluster center.
            corpus_embeddings: 2-D array with one row per document.
            verbose: When true, print the full center-to-document distances.

        Returns:
            Dict of ``{cluster index: document index}`` with plain ``int`` values.
        """
        # Imported explicitly: a bare ``import scipy`` does not guarantee that
        # the ``scipy.spatial`` submodule is loaded.
        from scipy.spatial.distance import cdist

        distances = cdist(cluster_centers, corpus_embeddings)
        if verbose:
            print("\n\n calculate distance of cluster's center point = ")
            print(distances)

        return {
            cluster: int(np.argmin(row))
            for cluster, row in enumerate(distances)
        }

    def run_sklearn_cluster_kmeans(self, selected_pretrained_model, showcharts, verbose, n_clusters=5):
        """Cluster the corpus with KMeans over sentence embeddings.

        Prints the labeled corpus and, for every cluster, its size and the
        document closest to its center.

        Args:
            selected_pretrained_model: Name passed to ``SentenceTransformer``.
            showcharts: When true, plot a 2-D UMAP projection of the clusters.
            verbose: When true, print intermediate results.
            n_clusters: Requested number of clusters (default 5); capped at
                the number of documents.
        """
        _, corpus_embeddings = self._embed_corpus(selected_pretrained_model, verbose)

        # KMeans rejects a cluster count larger than the number of samples.
        cluster_count = min(n_clusters, len(self.corpus))
        kmeans = KMeans(n_clusters=cluster_count, random_state=0).fit(corpus_embeddings)
        if verbose:
            print("\n\n Show Cluster using SkLearn KMeans")
            print(kmeans)

        corpus_labeled = pd.DataFrame({'ClusterLabel': kmeans.labels_, 'Sentence': self.corpus})
        print("\n\n corpus_labeled = ")
        print(corpus_labeled)

        cls_dist = pd.Series(kmeans.labels_).value_counts()
        if verbose:
            print("\n\n frequency of cluster label = ")
            print(cls_dist)

        centers = self._closest_document_indices(
            kmeans.cluster_centers_, corpus_embeddings, verbose
        )

        print("\n\n Cluster's center example = ")
        print("Cluster", "Size", "Center-idx", "Center-Example", sep="\t\t")
        for cluster, doc_index in centers.items():
            # .get(): a label with no members is absent from value_counts().
            print(cluster, cls_dist.get(cluster, 0), doc_index, self.corpus[doc_index], sep="\t\t")

        if showcharts:
            print("\n\n Visualization of the cluster points")

            projected = umap.UMAP(n_components=2, min_dist=0.0).fit_transform(corpus_embeddings)

            plt.subplots(figsize=(12, 8))
            plt.scatter(projected[:, 0], projected[:, 1], c=kmeans.labels_, s=1, cmap='Paired')
            for cluster, doc_index in centers.items():
                plt.text(projected[doc_index, 0], projected[doc_index, 1], "CLS-" + str(cluster), fontsize=24)
            plt.colorbar()
            plt.show()

    def run_topic_modeling_bertopic(self, selected_pretrained_model, verbose):
        """Fit BERTopic on the corpus and print the topics it finds.

        Args:
            selected_pretrained_model: Name passed to ``SentenceTransformer``.
            verbose: When true, print intermediate results.
        """
        # Load the model and encode once; the embeddings are handed to
        # BERTopic below so it does not encode the corpus a second time.
        sentence_model, corpus_embeddings = self._embed_corpus(selected_pretrained_model, verbose)

        print("\n\n Topic Modeling with BERTopic")

        topic_model = BERTopic(embedding_model=sentence_model)
        if verbose:
            print(topic_model)

        documents = list(self.corpus)
        topics, _ = topic_model.fit_transform(documents, embeddings=corpus_embeddings)

        topic_info = topic_model.get_topic_info()
        print(topic_info[:6])

        corpus_labeled = pd.DataFrame({'ClusterLabel': topics, 'Sentence': self.corpus})
        if verbose:
            print("\n\n corpus_labeled = ")
            print(corpus_labeled)

        print("\n\n topics for each cluster = ")

        # Use the topic ids BERTopic reports (these can include -1 for
        # outliers) rather than assuming they run 0..n-1.
        for topic_id in topic_info["Topic"]:
            print("Cluster #" + str(topic_id) + " = ")
            print(topic_model.get_topic(topic_id))
| 173 | + | |
| 174 | + | |