# 作者:欧新宇(Xinyu OU)
# 本文档所展示的测试结果,均运行于:Intel Core i7-7700K CPU 4.2GHz
# Build a bag-of-words vocabulary from a simple English sentence and show
# how CountVectorizer tokenizes it (vocabulary_ maps token -> column index).
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer()
en = ['The quick brown fox jumps over a lazy dog']
vect.fit(en)
print('单词数:{}'.format(len(vect.vocabulary_)))
print('分词:{}'.format(vect.vocabulary_))
# FIX: the original had a bare `vect` expression here — notebook display
# residue with no effect in a plain script; removed.
# Re-fit on an unsegmented Chinese sentence: without spaces the whole
# sentence is treated as a single token, so the vocabulary is useless.
cn = ['那只敏捷的棕色狐狸跳过了一只懒惰的狗']
vect.fit(cn)
vocab = vect.vocabulary_
print('单词数:{}'.format(len(vocab)))
print('分词:{}'.format(vocab))
# Segment the Chinese sentence with jieba first, join tokens with spaces,
# then the default tokenizer can split it into real words.
import jieba

tokens = jieba.cut('那只敏捷的棕色狐狸跳过了一只懒惰的狗')
cn = [' '.join(tokens)]
print(cn)
vect.fit(cn)
vocab = vect.vocabulary_
print('单词数:{}'.format(len(vocab)))
print('分词:{}'.format(vocab))
# Encode the sentence as a sparse count vector over that vocabulary.
bag_of_words = vect.transform(cn)
print('转化为词袋的特征:\n{}'.format(repr(bag_of_words)))
print(bag_of_words)
print('词袋的密度表达:\n{}'.format(bag_of_words.toarray()))
# Encode a new sentence with the existing vocabulary: repeated words raise
# the counts, unseen words are simply dropped.
segmented = jieba.cut('懒惰的狐狸不如敏捷的狐狸敏捷,敏捷的狐狸不如懒惰的狐狸懒惰')
cn2 = [' '.join(segmented)]
print(cn2)
new_bag = vect.transform(cn2)
print('转化为词袋的特征:\n{}'.format(repr(new_bag)))
print('词袋的密度表达:\n{}'.format(new_bag.toarray()))
# Fit a vocabulary on one sentence, then encode a word-permuted sentence:
# plain bag-of-words discards word order, so both can look identical.
joke = [' '.join(jieba.cut('道士看见和尚亲吻了尼姑的嘴唇'))]
vect.fit(joke)
joke_feature = vect.transform(joke)
print('这句话的特征表达:\n{}'.format(joke_feature.toarray()))

joke2 = [' '.join(jieba.cut('尼姑看见道士的嘴唇亲吻了和尚'))]
joke2_feature = vect.transform(joke2)
print('这句话的特征表达:\n{}'.format(joke2_feature.toarray()))
# Switch to bigram features (ngram_range=(2, 2)) so that word order is
# captured and the two permuted sentences get different encodings.
vect = CountVectorizer(ngram_range=(2, 2))
cv = vect.fit(joke)
joke_feature = cv.transform(joke)
# FIX: get_feature_names() was deprecated in scikit-learn 1.0 and removed
# in 1.2; get_feature_names_out() is the replacement (tolist() keeps the
# printed form a plain Python list, as before).
print('调整n-Gram参数后的词典:{}'.format(cv.get_feature_names_out().tolist()))
print('新的特征表达:{}'.format(joke_feature.toarray()))
joke2 = jieba.cut('尼姑看见道士的嘴唇亲吻了和尚')
joke2 = [' '.join(joke2)]
joke2_feature = vect.transform(joke2)
print('这句话的特征表达:\n{}'.format(joke2_feature.toarray()))
# Show the vocabulary TfidfVectorizer builds for the same sentence.
from sklearn.feature_extraction.text import TfidfVectorizer

tf = TfidfVectorizer().fit(joke)
# FIX: get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() (tolist() keeps the plain-list printout).
print(tf.get_feature_names_out().tolist())
# FIX: '!tree ACLIMDB' is an IPython shell escape — it is a syntax error in
# a plain .py file, so it is kept only as a comment for reference.
# !tree ACLIMDB
# Replace 'aclImdb' below with the folder where you placed the dataset.
# Load the IMDB sentiment train/test sets; load_files reads one document
# per file and derives the labels from the subfolder names.
from sklearn.datasets import load_files

train_set = load_files('Imdblite/train/')
X_train, y_train = train_set.data, train_set.target
print('训练集文件数量:{}'.format(len(X_train)))
print('\n随机抽一个看看:\n', X_train[22])
# Documents are bytes; strip the HTML line breaks left in the raw reviews.
X_train = [doc.replace(b'<br />', b' ') for doc in X_train]
test = load_files('Imdblite/test/')
X_test, y_test = test.data, test.target
X_test = [doc.replace(b'<br />', b' ') for doc in X_test]
# FIX: the original had a bare `len(X_test)` expression — notebook display
# residue that prints nothing in a script; print it explicitly.
print(len(X_test))
# Fit a bag-of-words vocabulary on the training reviews and encode them.
vect = CountVectorizer().fit(X_train)
X_train_vect = vect.transform(X_train)
# FIX: get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() and convert once for both printouts.
feature_names = vect.get_feature_names_out().tolist()
print('训练集样本特征数量:{}'.format(len(feature_names)))
print('最后10个训练集样本特征:{}'.format(feature_names[-10:]))
# Baseline: linear SVM on raw term counts, scored by cross-validation on
# the training set and then once on the held-out test set.
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score

scores = cross_val_score(LinearSVC(), X_train_vect, y_train)
print('模型平均分:{:.3f}'.format(scores.mean()))

clf = LinearSVC()
clf.fit(X_train_vect, y_train)
X_test_vect = vect.transform(X_test)
print('测试集模型得分:{}'.format(clf.score(X_test_vect, y_test)))
# Re-weight the raw counts with tf-idf and evaluate again.
from sklearn.feature_extraction.text import TfidfTransformer

tfidf = TfidfTransformer(smooth_idf=False)
tfidf.fit(X_train_vect)
X_train_tfidf = tfidf.transform(X_train_vect)
X_test_tfidf = tfidf.transform(X_test_vect)
print('未经tfidf处理的特征:\n', X_train_vect[:5, :5].toarray())
print('经过tfidf处理的特征:\n', X_train_tfidf[:5, :5].toarray())
clf = LinearSVC().fit(X_train_tfidf, y_train)
scores2 = cross_val_score(LinearSVC(), X_train_tfidf, y_train)
# BUG FIX: the original printed scores.mean() — the PRE-tfidf cross-validation
# result — instead of the tf-idf result stored in scores2.
print('经过tf-idf处理的训练集交叉验证得分:{:.3f}'.format(scores2.mean()))
print('经过tf-idf处理的测试集得分:{:.3f}'.format(clf.score(X_test_tfidf,
                                         y_test)))
# Inspect scikit-learn's built-in English stop-word list.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

stop_words = list(ENGLISH_STOP_WORDS)
print('停用词个数:', len(ENGLISH_STOP_WORDS))
print('列出前20个和最后20个:\n', stop_words[:20],
      stop_words[-20:])
# Vectorize once more with English stop words removed, and re-score.
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer(smooth_idf=False, stop_words='english')
tfidf_vect.fit(X_train)
X_train_tfidf = tfidf_vect.transform(X_train)

scores3 = cross_val_score(LinearSVC(), X_train_tfidf, y_train)
clf.fit(X_train_tfidf, y_train)
X_test_tfidf = tfidf_vect.transform(X_test)

print('去掉停用词后训练集交叉验证平均分:{:.3f}'.format(scores3.mean()))
print('去掉停用词后测试集模型得分:{:.3f}'.format(clf.score(X_test_tfidf,
                                       y_test)))