""" ======================================================== Topics extraction with Non-Negative Matrix Factorization ======================================================== This is a proof of concept application of Non Negative Matrix Factorization of the term frequency matrix of a corpus of documents so as to extract an additive model of the topic structure of the corpus. The output is a list of topics, each represented as a list of terms (weights are not shown). The default parameters (n_samples / n_features / n_topics) should make the example runnable in a couple of tens of seconds. You can try to increase the dimensions of the problem, but be aware than the time complexity is polynomial. """ # Author: Olivier Grisel # Lars Buitinck # License: BSD 3 clause from __future__ import print_function from time import time from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.decomposition import NMF from sklearn.datasets import fetch_20newsgroups n_samples = 2000 n_features = 1000 n_topics = 10 n_top_words = 20 # Load the 20 newsgroups dataset and vectorize it. We use a few heuristics # to filter out useless terms early on: the posts are stripped of headers, # footers and quoted replies, and common English words, words occurring in # only one document or in at least 95% of the documents are removed. t0 = time() print("Loading dataset and extracting TF-IDF features...") dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes')) vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=n_features, stop_words='english') tfidf = vectorizer.fit_transform(dataset.data[:n_samples]) print("done in %0.3fs." % (time() - t0)) # Fit the NMF model print("Fitting the NMF model with n_samples=%d and n_features=%d..." % (n_samples, n_features)) nmf = NMF(n_components=n_topics, random_state=1).fit(tfidf) print("done in %0.3fs." % (time() - t0)) feature_names = vectorizer.get_feature_names() for topic_idx, topic in enumerate(nmf.components_): print("Topic #%d:" % topic_idx) print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]])) print()