Maryam: commit fb460ae2
maryam/core/util/iris/plotly.py → maryam/core/util/iris/plotlyutil.py (renamed)
    skipped 11 lines
 along with this program. If not, see <http://www.gnu.org/licenses/>.
 """
 
-import plotly
+import plotlyutil
 
 class main:
 
    skipped 20 lines
    maryam/core/util/iris/topic.py
+# core/util/iris/topic.py
+# Hatma Suryotrisongko
+
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+from sentence_transformers import SentenceTransformer
+
+class main:
+
+    def __init__(self, inputfile, filetype, keyword, showcharts, verbose):
+
+        from dask import dataframe as dd
+        import json
+        from gensim.parsing.preprocessing import remove_stopwords
+
+        if verbose:
+            print("\n\n DATASET = reading file : " + inputfile)
+            print("\n\n Search keyword = " + keyword)
+
+        if filetype == "csv":
+            # tmp = pd.read_csv(inputfile, header=None, low_memory=False)
+            tmp = dd.read_csv(inputfile, sep=';', header=None)
+            tmp2 = tmp.to_dask_array(lengths=True)
+            tmp3 = tmp2.compute()
+            tmp4 = pd.DataFrame(tmp3)
+
+            if verbose:
+                print("\n\n csv file (before preprocessing) = ")
+                print(tmp4)
+
+            self.corpus = tmp4[0].str.lower().apply(remove_stopwords).to_numpy()
+
+        elif filetype == "json":
+            with open(inputfile) as json_file:
+                jsonfile = json.load(json_file)
+
+            tmp = pd.DataFrame(jsonfile['results'])
+
+            if verbose:
+                print("\n\n json file (before preprocessing) = ")
+                print(tmp)
+
+            tmp['td'] = tmp['t'] + ' ' + tmp['d']
+            self.corpus = tmp['td'].str.lower().apply(remove_stopwords).to_numpy()
+
+        else:
+            print("ERROR: only csv or json files are accepted!")
+
+        if verbose:
+            print("\n\n number of corpus documents = ")
+            print(len(self.corpus))
+            print("\n\n self.corpus[0] = ")
+            print(self.corpus[0])
+            print("\n\n all self.corpus = ")
+            print(self.corpus)
+
+        if showcharts:
+            print("\n\n histogram of the number of words in each document")
+            pd.Series([len(e.split()) for e in self.corpus]).hist()
+            plt.show()
+
+    def run_sklearn_cluster_kmeans(self, selected_pretrained_model, showcharts, verbose):
+
+        from sklearn.cluster import KMeans
+        import scipy.spatial
+        import umap
+
+        pretrained_model = selected_pretrained_model
+        if verbose:
+            print("\n\n Model selection")
+            # https://www.sbert.net/docs/pretrained_models.html
+            print(pretrained_model)
+
+        model = SentenceTransformer(pretrained_model)
+        if verbose:
+            print(model)
+
+        corpus_embeddings = model.encode(self.corpus)
+        if verbose:
+            print("\n\n CORPUS EMBEDDING")
+            print(corpus_embeddings.shape)
+            print(corpus_embeddings)
+
+        K = 5
+        kmeans = KMeans(n_clusters=K, random_state=0).fit(corpus_embeddings)
+        if verbose:
+            print("\n\n Show clusters using scikit-learn KMeans")
+            print(kmeans)
+
+        corpus_labeled = pd.DataFrame({'ClusterLabel': kmeans.labels_, 'Sentence': self.corpus})
+        print("\n\n corpus_labeled = ")
+        print(corpus_labeled)
+
+        cls_dist = pd.Series(kmeans.labels_).value_counts()
+        if verbose:
+            print("\n\n frequency of each cluster label = ")
+            print(cls_dist)
+
+        distances = scipy.spatial.distance.cdist(kmeans.cluster_centers_, corpus_embeddings)
+        if verbose:
+            print("\n\n distances from each cluster center = ")
+            print(distances)
+
+        print("\n\n Cluster center examples = ")
+
+        centers = {}
+        print("Cluster", "Size", "Center-idx", "Center-Example", sep="\t\t")
+        for i, d in enumerate(distances):
+            ind = np.argsort(d, axis=0)[0]
+            centers[i] = ind
+            print(i, cls_dist[i], ind, self.corpus[ind], sep="\t\t")
+
+        if showcharts:
+            print("\n\n Visualization of the cluster points")
+
+            X = umap.UMAP(n_components=2, min_dist=0.0).fit_transform(corpus_embeddings)
+            labels = kmeans.labels_
+
+            fig, ax = plt.subplots(figsize=(12, 8))
+            plt.scatter(X[:, 0], X[:, 1], c=labels, s=1, cmap='Paired')
+            for c in centers:
+                plt.text(X[centers[c], 0], X[centers[c], 1], "CLS-" + str(c), fontsize=24)
+            plt.colorbar()
+            plt.show()
+
+    def run_topic_modeling_bertopic(self, selected_pretrained_model, verbose):
+
+        from bertopic import BERTopic
+
+        pretrained_model = selected_pretrained_model
+        if verbose:
+            print("\n\n Model selection")
+            # https://www.sbert.net/docs/pretrained_models.html
+            print(pretrained_model)
+
+        sentence_model = SentenceTransformer(pretrained_model)
+        if verbose:
+            print(sentence_model)
+
+        corpus_embeddings = sentence_model.encode(self.corpus)
+        if verbose:
+            print("\n\n CORPUS EMBEDDING")
+            print(corpus_embeddings.shape)
+            print(corpus_embeddings)
+
+        print("\n\n Topic Modeling with BERTopic")
+
+        topic_model = BERTopic(embedding_model=sentence_model)
+        if verbose:
+            print(topic_model)
+
+        topics, _ = topic_model.fit_transform(self.corpus)
+        output = topic_model.get_topic_info()
+        print(output[:6])
+
+        corpus_labeled = pd.DataFrame({'ClusterLabel': topics, 'Sentence': self.corpus})
+        if verbose:
+            print("\n\n corpus_labeled = ")
+            print(corpus_labeled)
+
+        print("\n\n topics for each cluster = ")
+
+        # iterate over the actual topic ids (BERTopic includes the -1 outlier topic)
+        for topic_id in output['Topic']:
+            print("Cluster #" + str(topic_id) + " = ")
+            print(topic_model.get_topic(topic_id))
+
+        return output
+
+
+    def run_search_topics_top2vec(self, keyword, showcharts, verbose):
+
+        from top2vec import Top2Vec
+
+        print("\n\n Search Topics Using Top2Vec (caution: might not work well for a small dataset)")
+        print("\n the Search Keyword = " + keyword)
+
+        pretrained_embedding_model = "universal-sentence-encoder-multilingual"
+        if verbose:
+            print("\n\n Pretrained Embedding Model")
+            # https://tfhub.dev/google/universal-sentence-encoder-multilingual/
+            # a 16-language text encoder (Arabic, Chinese-simplified, Chinese-traditional, English, French, German, Italian, Japanese, Korean, Dutch, Polish, Portuguese, Spanish, Thai, Turkish, Russian)
+            print(pretrained_embedding_model)
+
+        model = Top2Vec(documents=self.corpus.tolist(), speed="learn", workers=8)
+        if verbose:
+            print("\n Model = ")
+            print(model)
+
+        if model.get_num_topics() < 5:
+            ntopics = model.get_num_topics()
+        else:
+            ntopics = 5
+
+        topic_words, word_scores, topic_nums = model.get_topics(ntopics)
+        print(topic_words)
+        print(word_scores)
+        print(topic_nums)
+
+        print("\n Semantic Search Documents by Keywords = ")
+        documents, document_scores, document_ids = model.search_documents_by_keywords(keywords=[keyword], num_docs=5)
+        for doc, score, doc_id in zip(documents, document_scores, document_ids):
+            print(f"Document: {doc_id}, Score: {score}")
+            print("-----------")
+            print(doc)
+            print("-----------")
+            print()
+
+        if showcharts:
+            print("\n\n Generate Word Clouds = ")
+            topic_words, word_scores, topic_scores, topic_nums = model.search_topics(keywords=[keyword], num_topics=ntopics)
+            for topic in topic_nums:
+                model.generate_topic_wordcloud(topic)
+
+        print("\n Similar Keywords = ")
+        words, word_scores = model.similar_words(keywords=[keyword], keywords_neg=[], num_words=20)
+        for word, score in zip(words, word_scores):
+            print(f"{word} {score}")
+
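For orientation, a minimal sketch of how the new class is driven, mirroring the calls that maryam/modules/iris/topicmodeling.py (below) makes through the framework. The direct import path and the standalone invocation are assumptions for illustration; inside Maryam the class is resolved via self.topic(...).

    # Hypothetical standalone use of the new core util (assumed import path).
    from maryam.core.util.iris.topic import main

    # arguments: inputfile, filetype, keyword, showcharts, verbose
    run = main('mixed.json', 'json', 'music', False, True)

    # KMeans clustering over sentence-transformer embeddings
    run.run_sklearn_cluster_kmeans('all-mpnet-base-v2', False, True)

    # BERTopic topic modeling; returns the get_topic_info() DataFrame
    info = run.run_topic_modeling_bertopic('all-distilroberta-v1', True)

    # Top2Vec keyword search over the same corpus
    run.run_search_topics_top2vec('music', False, True)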
    maryam/modules/iris/topicmodeling.py
+# modules/iris/topicmodeling.py
+# # TESTING the main function:
+# topicmodeling -i mixed.json -t json -m all-distilroberta-v1
+# topicmodeling -i mixed.json -t json -s -v -m all-distilroberta-v1
+# topicmodeling -i mixed.json -t json -s -m all-distilroberta-v1
+# topicmodeling -i mixed.json -t json -v -m all-distilroberta-v1
+# topicmodeling -i testdataset.csv -t csv -m all-mpnet-base-v2
+# topicmodeling -i testdataset.csv -t csv -s -v -m all-mpnet-base-v2
+# topicmodeling -i testdataset.csv -t csv -s -m all-mpnet-base-v2
+# topicmodeling -i testdataset.csv -t csv -v -m all-mpnet-base-v2
+# # TESTING the reporting & API functions:
+# topicmodeling -i mixed.json -t json -m all-distilroberta-v1 --output
+# report json testreport iris/topicmodeling
+# topicmodeling -i mixed.json -t json -m all-distilroberta-v1 --api
+# # TESTING searching topics by a keyword (Top2Vec):
+# topicmodeling -i mixed.json -t json -m all-distilroberta-v1 -k music
+# Note: download the test datasets from https://github.com/keamanansiber/Maryam/tree/master/notebooks
+# Hatma Suryotrisongko
+
+
+meta = {
+    'name': 'Topic Modeling',
+    'author': 'Hatma Suryotrisongko',
+    'version': '0.1.0',
+    'description': 'Topic Modeling Algorithms.',
+    'required': ('dask', 'scikit-learn', 'umap-learn', 'bertopic', 'gensim', 'top2vec'),
+    'options': (
+        ('inputfile', None, True, 'Input file that contains the data', '-i', 'store', str),
+        ('filetype', None, True, 'File type: csv/json', '-t', 'store', str),
+        ('keyword', None, False, 'Search keyword', '-k', 'store', str),
+        ('showcharts', None, False, 'Show charts?', '-s', 'store_true', bool),
+        ('verbose', None, False, 'Verbose output?', '-v', 'store', bool),
+        ('pretrained_model', None, True, 'Model to use for the embedding', '-m', 'store', str),
+    ),
+    # -s and -v are store_true flags, so they take no value on the command line
+    'examples': ('topicmodeling -i mixed.json -t json -k music -s -v -m all-mpnet-base-v2',)
+}
+
+
+def module_api(self):
+
+    run = self.topic(self.options['inputfile'], self.options['filetype'], self.options['keyword'], self.options['showcharts'], self.options['verbose'])
+    run.run_sklearn_cluster_kmeans(self.options['pretrained_model'], self.options['showcharts'], self.options['verbose'])
+
+    results = run.run_topic_modeling_bertopic(self.options['pretrained_model'], self.options['verbose'])
+    self.output("\n\nResults = \n")
+    self.output(results)
+
+    output = {'results': results.to_json(orient="records")}
+
+    inputfile = self.options['inputfile']
+    self.save_gather(output, 'iris/topicmodeling', inputfile, output=self.options['output'])
+
+    if self.options['keyword'] is not None:
+        self.output(" keyword = " + self.options['keyword'])
+        run.run_search_topics_top2vec(self.options['keyword'], self.options['showcharts'], self.options['verbose'])
+
+    return output
+
+
+def module_run(self):
+    output = module_api(self)
+    self.output("\n\nOutput = \n")
+    self.output(output)
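For input preparation: with -t json, topic.py reads the top-level 'results' array and concatenates the 't' and 'd' fields of each record; with -t csv it reads a semicolon-separated file and takes the text from column 0. A minimal sketch that writes a json file of the expected shape (the field contents are illustrative, not taken from the published test dataset):

    import json

    # Only the 'results'/'t'/'d' keys are required by topic.py.
    sample = {
        "results": [
            {"t": "best albums of the year", "d": "a roundup of new music releases"},
            {"t": "new security advisory", "d": "details of a reported vulnerability"},
        ]
    }

    with open('mixed.json', 'w') as f:
        json.dump(sample, f)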
    setup.py
    skipped 43 lines
     'nltk',
     'matplotlib',
     'pandas',
-    'wordcloud'
+    'wordcloud',
+    'numpy',
+    'dask',
+    'scikit-learn',
+    'scipy',
+    'umap-learn',
+    'bertopic',
+    'sentence_transformers',
+    'gensim',
+    'top2vec'
 ],
 classifiers=[
     'Programming Language :: Python :: 3.10',
    skipped 8 lines