import jieba
import numpy as np
import pandas as pd
import joblib  # sklearn.externals.joblib was removed from recent scikit-learn; use the standalone joblib package
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from gensim.models.word2vec import Word2Vec

# Step 1: load the data files and tokenize each review
def loadfile():
    neg = pd.read_excel('./data/neg.xls', header=None)
    pos = pd.read_excel('./data/pos.xls', header=None)
    cw = lambda x: list(jieba.cut(x))
    pos['words'] = pos[0].apply(cw)
    neg['words'] = neg[0].apply(cw)
    # Use 1 for positive, 0 for negative
    y = np.concatenate((np.ones(len(pos)), np.zeros(len(neg))))
    x_train, x_test, y_train, y_test = train_test_split(
        np.concatenate((pos['words'], neg['words'])), y,
        test_size=0.2, random_state=666)
    np.save('./svm_data/y_train.npy', y_train)
    np.save('./svm_data/y_test.npy', y_test)
    return x_train, x_test

# Step 2: compute word vectors and average the word vectors of each review
# to obtain one fixed-length feature vector per review.
# `sentence` is a single tokenized review: look up each word's vector and take the mean.
def buildWordVector(sentence, size, w2v_model):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in sentence:
        try:
            vec += w2v_model.wv[word].reshape((1, size))
            count += 1.
        except KeyError:
            continue
    if count != 0:
        vec /= count
    return vec

# Train the word vectors and build the feature matrices
def get_train_vecs(x_train, x_test):
    # Word-vector dimensionality: 100
    n_dim = 100
    # Initialize model and build vocab (gensim 4.x API: `vector_size` instead of `size`)
    w2v_model = Word2Vec(vector_size=n_dim, min_count=10)
    w2v_model.build_vocab(x_train)
    # Train the word-vector model on the training set
    w2v_model.train(x_train, total_examples=w2v_model.corpus_count, epochs=w2v_model.epochs)
    # Build the training-set feature vectors and save them
    train_vecs = np.concatenate([buildWordVector(line, n_dim, w2v_model) for line in x_train])
    np.save('svm_data/train_vecs.npy', train_vecs)
    print("Train word_vector shape:", train_vecs.shape)
    # Continue training the word-vector model on the test set
    w2v_model.train(x_test, total_examples=len(x_test), epochs=w2v_model.epochs)
    # Build the test-set feature vectors and save them
    test_vecs = np.concatenate([buildWordVector(line, n_dim, w2v_model) for line in x_test])
    np.save('svm_data/test_vecs.npy', test_vecs)
    print("Test word_vector shape:", test_vecs.shape)
    # Save the word2vec model
    w2v_model.save('svm_data/w2v_model/w2v_model.pkl')

# Train the SVM classifier
def svm_train(train_vecs, y_train, test_vecs, y_test):
    clf = SVC(kernel='rbf', verbose=True)
    clf.fit(train_vecs, y_train)
    # Save the trained SVM model
    joblib.dump(clf, 'svm_data/svm_model/model.pkl')
    print('SVM score:', clf.score(test_vecs, y_test))

# Compute the feature vector of a single sentence to be classified
def get_predict_vecs(sentence):
    n_dim = 100
    w2v_model = Word2Vec.load('svm_data/w2v_model/w2v_model.pkl')
    predict_vecs = buildWordVector(sentence, n_dim, w2v_model)
    return predict_vecs

# Predict the sentiment of one comment
def svm_predict(comment):
    sentence = jieba.lcut(comment)
    predict_vecs = get_predict_vecs(sentence)
    clf = joblib.load('svm_data/svm_model/model.pkl')
    result = clf.predict(predict_vecs)
    if int(result[0]) == 1:
        print(comment, '\n AI prediction: positive')
    else:
        print(comment, '\n AI prediction: negative')

# Load the saved feature matrices and labels
def get_data():
    train_vecs = np.load('svm_data/train_vecs.npy')
    y_train = np.load('svm_data/y_train.npy')
    test_vecs = np.load('svm_data/test_vecs.npy')
    y_test = np.load('svm_data/y_test.npy')
    return train_vecs, y_train, test_vecs, y_test

if __name__ == '__main__':
    # Training
    x_train, x_test = loadfile()
    get_train_vecs(x_train, x_test)
    train_vecs, y_train, test_vecs, y_test = get_data()
    svm_train(train_vecs, y_train, test_vecs, y_test)
    # Prediction on a single Chinese review (a positive phone review)
    string = '牛逼的手机,从3米高的地方摔下去都没坏,质量非常好'
    svm_predict(string)
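Once the script has been run once and the files under svm_data/ have been written, the saved Word2Vec and SVM models can be reused for further predictions without retraining. The snippet below is a minimal sketch of that reuse path, appended after the script (or after importing it as a module); the two sample comments are made-up examples, not part of the original data set.

# Minimal sketch: reuse the saved models via svm_predict() without retraining.
# Assumes svm_data/w2v_model/w2v_model.pkl and svm_data/svm_model/model.pkl
# already exist from a previous run; the comments below are hypothetical.
for comment in ['物流很快,包装也很好,非常满意',        # "fast shipping, nice packaging, very satisfied"
                '用了一个星期就黑屏了,再也不会买了']:  # "screen went black after a week, never buying again"
    svm_predict(comment)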