-
Notifications
You must be signed in to change notification settings - Fork 285
Expand file tree
/
Copy pathdemo_pair_plot_category_focused.py
More file actions
27 lines (22 loc) · 999 Bytes
/
demo_pair_plot_category_focused.py
File metadata and controls
27 lines (22 loc) · 999 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import PCA
import scattertext as st
# Demo script: build a Scattertext corpus from the 20 Newsgroups training
# split and write a category-focused pair plot for 'alt.atheism' to HTML.

# Load the training subset with headers, footers and quoted replies removed.
newsgroups_train = fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
)

# Fitting the TF-IDF vectorizer populates `vectorizer.vocabulary_`, which is
# reused below. `tfidf_X` itself is not referenced again in this script.
vectorizer = TfidfVectorizer()
tfidf_X = vectorizer.fit_transform(newsgroups_train.data)

# A CountVectorizer pinned to the same vocabulary produces the matrix passed
# as X, so its columns line up with `feature_vocabulary`.
count_vectorizer = CountVectorizer(vocabulary=vectorizer.vocabulary_)
corpus = st.CorpusFromScikit(
    X=count_vectorizer.fit_transform(newsgroups_train.data),
    y=newsgroups_train.target,
    feature_vocabulary=vectorizer.vocabulary_,
    category_names=newsgroups_train.target_names,
    raw_texts=newsgroups_train.data,
).build().get_unigram_corpus()

# Render the pair plot for the 'alt.atheism' category, using a category
# projector backed by PCA(10).
html = st.produce_category_focused_pairplot(
    corpus=corpus,
    category_projector=st.CategoryProjector(projector=PCA(10)),
    category='alt.atheism',
)

file_name = 'demo_pair_plot_category_focused.html'
# Write the page as UTF-8 bytes. The context manager guarantees the handle is
# flushed and closed even if the write raises, instead of relying on garbage
# collection of an anonymous file object.
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('Open ./%s in Chrome.' % (file_name))