| 1 | + | # core/util/iris/topic.py |
| 2 | + | # Hatma Suryotrisongko |
| 3 | + | |
| 4 | + | import pandas as pd |
| 5 | + | import numpy as np |
| 6 | + | import matplotlib.pyplot as plt |
| 7 | + | from sentence_transformers import SentenceTransformer |
| 8 | + | |
class main:
    """Topic exploration over a text corpus loaded from a CSV or JSON file.

    The constructor reads the file and stores the lower-cased,
    stop-word-free documents in ``self.corpus`` (a NumPy array of strings).
    The ``run_*`` methods then cluster the corpus with KMeans, model topics
    with BERTopic, or run keyword searches with Top2Vec. All results are
    printed; only ``run_topic_modeling_bertopic`` returns a value.
    """

    # Upper bound on clusters / topics / documents reported by the run_* methods.
    _TOP_N = 5

    def __init__(self, inputfile, filetype, keyword, showcharts, verbose):
        """Load ``inputfile`` and build ``self.corpus``.

        Args:
            inputfile: Path of the dataset to read.
            filetype: ``"csv"`` (read with ``sep=';'`` and no header; the
                text is taken from the first column) or ``"json"`` (an
                object with a ``results`` list whose items carry ``t`` and
                ``d`` fields, which are joined with a space).
            keyword: Search keyword; only echoed here when ``verbose`` is set.
            showcharts: When truthy, plot a histogram of words per document.
            verbose: When truthy, print the intermediate data.

        Raises:
            ValueError: If ``filetype`` is neither ``"csv"`` nor ``"json"``.
        """
        # Fail fast, before any heavy import: an unsupported type used to
        # print a message and then crash with AttributeError on self.corpus.
        if filetype not in ("csv", "json"):
            raise ValueError("ERROR, only accept csv or json file!")

        from gensim.parsing.preprocessing import remove_stopwords

        if verbose:
            print(f"\n\n DATASET = reading file : {inputfile}")
            print(f"\n\n Search keyword = {keyword}")

        if filetype == "csv":
            from dask import dataframe as dd

            lazy_frame = dd.read_csv(inputfile, sep=';', header=None)
            # Materialise the dask frame as an in-memory pandas DataFrame.
            raw = pd.DataFrame(lazy_frame.to_dask_array(lengths=True).compute())

            if verbose:
                print("\n\n csv file (before preprocessing) = ")
                print(raw)

            texts = raw[0]
        else:
            import json

            with open(inputfile, encoding="utf-8") as json_file:
                payload = json.load(json_file)

            raw = pd.DataFrame(payload['results'])

            if verbose:
                print("\n\n json file (before preprocessing) = ")
                print(raw)

            texts = raw['t'] + ' ' + raw['d']

        self.corpus = texts.str.lower().apply(remove_stopwords).to_numpy()

        if verbose:
            print("\n\n number of corpus = ")
            print(len(self.corpus))
            if len(self.corpus):
                # Guarded so an empty dataset does not raise IndexError here.
                print("\n\n self.corpus[0] = ")
                print(self.corpus[0])
            print("\n\n all self.corpus = ")
            print(self.corpus)

        if showcharts:
            print("\n\n histogram of the number of words in each corpus")
            pd.Series([len(doc.split()) for doc in self.corpus]).hist()
            plt.show()

    def _encode_corpus(self, pretrained_model, verbose):
        """Load a sentence-transformer and embed ``self.corpus``.

        Returns:
            A ``(model, embeddings)`` tuple: the loaded ``SentenceTransformer``
            and the result of ``model.encode(self.corpus)``.
        """
        if verbose:
            print("\n\n Model selection")
            # https://www.sbert.net/docs/pretrained_models.html
            print(pretrained_model)

        model = SentenceTransformer(pretrained_model)
        if verbose:
            print(model)

        corpus_embeddings = model.encode(self.corpus)
        if verbose:
            print("\n\n CORPUS EMBEDDING")
            print(corpus_embeddings.shape)
            print(corpus_embeddings)

        return model, corpus_embeddings

    def run_sklearn_cluster_kmeans(self, selected_pretrained_model, showcharts, verbose):
        """Cluster the corpus embeddings with KMeans and print a summary.

        For every cluster the document closest to the cluster centre is
        printed as its representative example.

        Args:
            selected_pretrained_model: Name of the sentence-transformer model
                used to embed the corpus.
            showcharts: When truthy, draw a 2-D UMAP scatter plot of the
                embeddings coloured by cluster label.
            verbose: When truthy, print the intermediate data.
        """
        from sklearn.cluster import KMeans
        # Import the function explicitly instead of relying on
        # ``import scipy`` having loaded the ``scipy.spatial`` submodule.
        from scipy.spatial.distance import cdist

        _, corpus_embeddings = self._encode_corpus(selected_pretrained_model, verbose)

        # KMeans cannot build more clusters than there are documents.
        n_clusters = min(self._TOP_N, len(corpus_embeddings))
        kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(corpus_embeddings)
        if verbose:
            print("\n\n Show Cluster using SkLearn KMeans")
            print(kmeans)

        corpus_labeled = pd.DataFrame({'ClusterLabel': kmeans.labels_, 'Sentence': self.corpus})
        print("\n\n corpus_labeled = ")
        print(corpus_labeled)

        cls_dist = pd.Series(kmeans.labels_).value_counts()
        if verbose:
            print("\n\n frequency of cluster label = ")
            print(cls_dist)

        # One row per cluster centre, one column per document.
        distances = cdist(kmeans.cluster_centers_, corpus_embeddings)
        if verbose:
            print("\n\n calculate distance of cluster's center point = ")
            print(distances)

        print("\n\n Cluster's center example = ")

        centers = {}
        print("Cluster", "Size", "Center-idx", "Center-Example", sep="\t\t")
        for cluster_id, row in enumerate(distances):
            nearest = np.argmin(row)
            centers[cluster_id] = nearest
            # .get() avoids a KeyError if a cluster ended up with no members.
            print(cluster_id, cls_dist.get(cluster_id, 0), nearest, self.corpus[nearest], sep="\t\t")

        if showcharts:
            # Only needed for plotting, so it is not required otherwise.
            import umap

            print("\n\n Visualization of the cluster points")

            points = umap.UMAP(n_components=2, min_dist=0.0).fit_transform(corpus_embeddings)

            plt.subplots(figsize=(12, 8))
            plt.scatter(points[:, 0], points[:, 1], c=kmeans.labels_, s=1, cmap='Paired')
            for cluster_id, doc_idx in centers.items():
                plt.text(points[doc_idx, 0], points[doc_idx, 1], "CLS-" + str(cluster_id), fontsize=24)
            plt.colorbar()
            plt.show()

    def run_topic_modeling_bertopic(self, selected_pretrained_model, verbose):
        """Fit a BERTopic model on the corpus and print its topics.

        Args:
            selected_pretrained_model: Name of the sentence-transformer model
                used as BERTopic's embedding model.
            verbose: When truthy, print the intermediate data.

        Returns:
            The DataFrame returned by ``topic_model.get_topic_info()``.
        """
        from bertopic import BERTopic

        sentence_model, corpus_embeddings = self._encode_corpus(selected_pretrained_model, verbose)

        print("\n\n Topic Modeling with BERTopic")

        topic_model = BERTopic(embedding_model=sentence_model)
        if verbose:
            print(topic_model)

        # Reuse the embeddings computed above instead of loading the model
        # and encoding the whole corpus a second time inside BERTopic.
        topics, _ = topic_model.fit_transform(self.corpus.tolist(), embeddings=corpus_embeddings)
        output = topic_model.get_topic_info()
        print(output[:6])

        corpus_labeled = pd.DataFrame({'ClusterLabel': topics, 'Sentence': self.corpus})
        if verbose:
            print("\n\n corpus_labeled = ")
            print(corpus_labeled)

        print("\n\n topics for each cluster = ")

        # Iterate over the real topic ids: counting 0..len-1 over the rows
        # skips the -1 outlier topic and asks for an id that does not exist.
        for topic_id in output['Topic']:
            print("Cluster #" + str(topic_id) + " = ")
            print(topic_model.get_topic(topic_id))

        return output

    def run_search_topics_top2vec(self, keyword, showcharts, verbose):
        """Train Top2Vec on the corpus and print keyword-based search results.

        Prints the top topics, the documents closest to ``keyword`` and the
        words most similar to ``keyword``.

        Args:
            keyword: Single keyword used for the document, topic and
                similar-word searches.
            showcharts: When truthy, generate a word cloud for each topic
                matching ``keyword``.
            verbose: When truthy, print the intermediate data.
        """
        from top2vec import Top2Vec

        print("\n\n Search Topics Using Top2Vec (caution: might not work well for a small dataset)")
        print(f"\n the Search Keyword = {keyword}")

        # NOTE(review): this name is only printed; it is not passed to
        # Top2Vec below, so the model trains with Top2Vec's defaults.
        # Confirm whether ``embedding_model=`` was intended.
        pretrained_embedding_model = "universal-sentence-encoder-multilingual"
        if verbose:
            print("\n\n Pretrained Embedding Model")
            # https://tfhub.dev/google/universal-sentence-encoder-multilingual/
            # 16 languages (Arabic, Chinese-simplified, Chinese-traditional, English, French, German, Italian, Japanese, Korean, Dutch, Polish, Portuguese, Spanish, Thai, Turkish, Russian) text encoder.
            print(pretrained_embedding_model)

        corpus_docs = self.corpus.tolist()
        model = Top2Vec(documents=corpus_docs, speed="learn", workers=8)
        if verbose:
            print("\n Model = ")
            print(model)

        ntopics = min(model.get_num_topics(), self._TOP_N)

        topic_words, word_scores, topic_nums = model.get_topics(ntopics)
        print(topic_words)
        print(word_scores)
        print(topic_nums)

        print("\n Semantic Search Documents by Keywords = ")
        # Never ask for more documents than the corpus holds.
        num_docs = min(self._TOP_N, len(corpus_docs))
        found_docs, document_scores, document_ids = model.search_documents_by_keywords(keywords=[keyword], num_docs=num_docs)
        for doc, score, doc_id in zip(found_docs, document_scores, document_ids):
            print(f"Document: {doc_id}, Score: {score}")
            print("-----------")
            print(doc)
            print("-----------")
            print()

        if showcharts:
            print("\n\n Generate Word Clouds = ")
            _, _, _, matched_topic_nums = model.search_topics(keywords=[keyword], num_topics=ntopics)
            for topic in matched_topic_nums:
                model.generate_topic_wordcloud(topic)

        print("\n Similar Keywords = ")
        words, similarity_scores = model.similar_words(keywords=[keyword], keywords_neg=[], num_words=20)
        for word, score in zip(words, similarity_scores):
            print(f"{word} {score}")
| 225 | + | |