python实战——文本挖掘+xgboost预测+数据处理+准确度计算整合版

if __name__=="__main__":  

    '''============================先导入数据=================================='''

    file_train = 'F:/goverment/exceloperating/all_tocai_train.csv'

    file_test = 'F:/goverment/exceloperating/all_tocai_test.csv'

    importSmallContentdata(file_train,Straindata,Sart_train,Strainlabel,0)

    importSmallContentdata(file_test,Stestdata,Sart_test,Stestlabel,1)

    #print("Stestlabel" ,len(Stestlabel))

    #print("小类导入数据完毕")

    importBtestlabel(file_test)

    #print("大类标签导入完毕")#共1329*4

from pandas import read_csv

import numpy as np

from sklearn.datasets.base import Bunch

import pickle    #导入cPickle包并且取一个别名pickle #持久化类

from sklearn.feature_extraction.text import TfidfVectorizer

import jieba

import xlwt

import operator#排序用

Straindata=[]

Strainlabel=[]

Sart_train=[]

Stestdata=[]

Stestlabel=[]

Sart_test=[]

Slast=[]

Snew=[]

BvsS=[]

Bnew=[]

Blast=[]

class obj:

    def __init__(self):

        self.key=0

        self.weight=0.0

def importSmallContentdata(file,data,art,label,f):

    dataset=read_csv(file)

    Sdata = dataset.values[:,:]

    print(type(Sdata))

    if f==1:

        for line in Sdata:

            ls=[]

            ls.append(line[18])

            ls.append(line[19])

            ls.append(line[20])

            ls.append(line[21])

            Slast.append(ls)

        #print(len(Slast))

        #print("需要对照的小类数据准备完毕")

    if f==0:

        BvsS2=[]

        for line in Sdata:

            BvsS2.append(line[18])

            BvsS2.append(line[19])

            BvsS2.append(line[20])

            BvsS2.append(line[21])

        '''去重'''

        bol=np.zeros(len(Sdata)+1)

        for i in BvsS2:

            if bol[i]== 0:

                BvsS.append(i)

                bol[i]=1

        BvsS.sort()

        print(BvsS)

    '''找到smalli不为0的装入Straindata,把数据分开'''

    for smalli in range(18,22):

        #print(smalli)

        count=0

        for line in Sdata:

            count=count+1

            if line[smalli]!='' and line[smalli]!=0 :

                k=1

                ls=[]

                for i in line:

                    if k==1:

                        art.append(i)

                        k=k+1

                        continue

                    if k==15:#k为14并不代表是line[14]，因为line是从0开始

                        break

                    ls.append(float(i))

                    k=k+1

                data.append(ls)

                label.append(line[smalli])

                if f==1:

                    Snew.append(count)

   # print("为什么都超限",len(Snew))

def importBtestlabel(file):

    dataset=read_csv(file)

    Bdata = dataset.values[:,:]

    for line in Bdata:

        ls=[]

        ls.append(line[14])

        ls.append(line[15])

        ls.append(line[16])

        ls.append(line[17])

        Blast.append(ls)

首先导入数据

列O到列P为标签，我们先预测small的4列，先将四列分开，预测完以后，取支持度最高的前四个作为预测结果，与原数据比较，比较的准则是：本该有的都有的即可，即eg：原：1,2,0,9，则预测出来是 9,2,5,1，也是正确的，方法：将预测出来一条记录的放到由52（small的范围是0-51）个01组成的列表中中，若预测出来是9，2,5,1，那么第9个，第2个，第5个，第1个为1，其余为0，对照的时候，只要第9个，第1个，第2个为0就正确（0代表没有，不需要对照）

对文字部分我们要提取特征，取前71个特征，不足补齐为-1

def getKvector(train_set,vec,n):

    nonzero=train_set.tdm.nonzero()

    k=0

    lis=[]

    gather=[]

    p=-1

    for i in nonzero[0]:

        p=p+1

        if k==i:

            a=obj()

            a.key=nonzero[1][p]

            a.weight=train_set.tdm[i,nonzero[1][p]]

            lis.append(a)

        else:

            lis.sort(key=lambda obj: obj.weight, reverse=True)#对链表内为类对象的排序

            gather.append(lis)

            while k < i:

                k=k+1

            lis=[]

            a=obj()

            a.key=nonzero[1][p]

            a.weight=train_set.tdm[i,nonzero[1][p]]

            lis.append(a)

    gather.append(lis)#gather存储的是每条数据的事实描述的特征向量，已经从小到大排好了，只不过每个存既有key又有weight

    #我们只要key，不再需要weight

    sj=1

    for i in gather:

        ls=[]

        for j in i:

            sj=sj+1

            ls.append(float(j.key))

        while sj<=n:

            sj=sj+1

            ls.append(-1)

        sj=1

        vec.append(ls)

'''读取停用词'''

def _readfile(path):

    with open(path, "rb") as fp:

        content = fp.read()

    return content  

''' 读取bunch对象'''

def _readbunchobj(path):

    with open(path, "rb") as file_obj:

        bunch = pickle.load(file_obj)

    return bunch  

'''写入bunch对象'''

def _writebunchobj(path, bunchobj):

    with open(path, "wb") as file_obj:

        pickle.dump(bunchobj, file_obj) 

def buildtrainbunch(bunch_path,art_train,trainlabel):

    bunch = Bunch(label=[],contents=[])

    for item1 in trainlabel:

        bunch.label.append(item1)

    #trainContentdatasave=[] #存储所有训练和测试数据的分词

    for item2 in art_train:

        item2=str(item2)

        item2 = item2.replace("\r\n", "")

        item2 = item2.replace(" ", "")

        content_seg=jieba.cut(item2)

        save2=''

        for item3 in content_seg:

            if len(item3) > 1 and item3!='\r\n':

                #trainContentdatasave.append(item3)

                save2=save2+","+item3

        bunch.contents.append(save2)

    with open(bunch_path, "wb") as file_obj:

        pickle.dump(bunch, file_obj)

    print("构建训练数据文本对象结束！！！")

def buildtestbunch(bunch_path,art_test,testlabel):

    bunch = Bunch(label=[],contents=[])

    for item1 in testlabel:

        bunch.label.append(item1)

    #testContentdatasave=[] #存储所有训练和测试数据的分词

    for item2 in art_test:

        item2=str(item2)

        item2 = item2.replace("\r\n", "")

        item2 = item2.replace(" ", "")

        content_seg=jieba.cut(item2)

        save2=''

        for item3 in content_seg:

            if len(item3) > 1 and item3!='\r\n':

                #testContentdatasave.append(item3)

                save2=save2+","+item3

        bunch.contents.append(save2)

    with open(bunch_path, "wb") as file_obj:

        pickle.dump(bunch, file_obj)

    print("构建测试数据文本对象结束！！！")

def vector_space(stopword_path,bunch_path,space_path):

    stpwrdlst = _readfile(stopword_path).splitlines()#读取停用词

    bunch = _readbunchobj(bunch_path)#导入分词后的词向量bunch对象

    #构建tf-idf词向量空间对象

    tfidfspace = Bunch(label=bunch.label,tdm=[], vocabulary={})  

    #权重矩阵tdm，其中，权重矩阵是一个二维矩阵，tdm[i][j]表示，第j个词（即词典中的序号）在第i个类别中的IF-IDF值

    #使用TfidVectorizer初始化向量空间模型

    vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5, min_df=0.0001,use_idf=True,max_features=15000)

    #print(vectorizer)

    #文本转为词频矩阵，单独保存字典文件

    tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)

    tfidfspace.vocabulary = vectorizer.vocabulary_

    #创建词袋的持久化

    _writebunchobj(space_path, tfidfspace)

    print("if-idf词向量空间实例创建成功！！！")

def testvector_space(stopword_path,bunch_path,space_path,train_tfidf_path):

    stpwrdlst = _readfile(stopword_path).splitlines()#把停用词变成列表

    bunch = _readbunchobj(bunch_path)

    tfidfspace = Bunch(label=bunch.label,tdm=[], vocabulary={})

    #导入训练集的TF-IDF词向量空间  ★★

    trainbunch = _readbunchobj(train_tfidf_path)

    tfidfspace.vocabulary = trainbunch.vocabulary  

    vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.7, vocabulary=trainbunch.vocabulary, min_df=0.001)  

    tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)

    _writebunchobj(space_path, tfidfspace)

    print("if-idf词向量空间实例创建成功！！！")

  '''==========================================================tf-idf对Bar进行文本特征提取============================================================================'''

    #导入分词后的词向量bunch对象

    train_bunch_path ="F:/goverment/exceloperating/trainbunch.bat"#Bunch保存路径

    train_space_path = "F:/goverment/exceloperating/traintfdifspace.dat"

    test_bunch_path ="F:/goverment/exceloperating/testbunch.bat"

    test_space_path = "F:/goverment/exceloperating/testtfdifspace.dat"

    stopword_path ="F:/goverment/exceloperating/hlt_stop_words.txt"

    '''============================================================tf-idf对Sart进行文本特征提取=============================================================================='''

    buildtrainbunch(train_bunch_path,Sart_train,Strainlabel)

    buildtestbunch(test_bunch_path,Sart_test,Stestlabel)

    vector_space(stopword_path,train_bunch_path,train_space_path)

    testvector_space(stopword_path,test_bunch_path,test_space_path,train_space_path)

    train_set=_readbunchobj(train_space_path)

    test_set=_readbunchobj(test_space_path)

    '''训练数据'''

    S_vec_train=[]

    getKvector(train_set,S_vec_train,76)

    '''测试数据'''

    S_vec_test=[]

    getKvector(test_set,S_vec_test,76)

    '''=================将得到的61个特征和之前的其它特征合并Btraindata=================='''

    '''小类训练数据'''

    S_vec_train=np.array(S_vec_train)

    #print(type(S_vec_train))

    #print(S_vec_train.shape)

    Straindata=np.array(Straindata)

    #print(type(Straindata))

    #print(Straindata.shape)

    Straindata=np.hstack((S_vec_train,Straindata))

    #print(Straindata)

    '''小类测试数据'''

    S_vec_test=np.array(S_vec_test)

    Stestdata=np.array(Stestdata)

    Stestdata=np.hstack((S_vec_test,Stestdata))

    '''

    B_vec_test=np.array(B_vec_test)

    Btestdata=np.array(Btestdata)

    Btestdata=np.hstack((B_vec_test,Btestdata))

    import xlwt

    myexcel = xlwt.Workbook()

    sheet = myexcel.add_sheet('sheet')

    si=-1

    sj=-1

    for i in range(len(Btraindata)):

        si=si+1

        for j in range(len(Btraindata[0])):

            sj=sj+1

            sheet.write(si,sj,str(Btraindata[i,j]))

        sj=-1

    myexcel.save("vector.xls")

# =============================================================================

#     import xlwt

#     myexcel = xlwt.Workbook()

#     sheet = myexcel.add_sheet('sheet')

#     si=-1

#     sj=-1

#     for i in range(len(Btestdata)):

#         si=si+1

#         for j in range(len(Btestdata[0])):

#             sj=sj+1

#             sheet.write(si,sj,str(Btestdata[i,j]))

#         sj=-1

#     myexcel.save("vector.xls")

接下里，我用了XGBClassifier来分类，因为这是我尝试了多个分类器后，准确度最高的。

    '''==========================分类算精度==========================='''

    print("分类算小类精度")

    Strainlabel=np.array(Strainlabel)

    Strainlabel=np.array(Strainlabel)

    from xgboost import XGBClassifier

    from sklearn import metrics

    clf= XGBClassifier(learning_rate =0.1,

     n_estimators=1150,

     max_depth=2,

     min_child_weight=1,

     gamma=0,

     subsample=0.8,

     colsample_bytree=0.8,

     objective= 'binary:logistic',

     nthread=4,#没用

     scale_pos_weight=1,#没用

     seed=27)

    clf.fit(Straindata, Strainlabel)

    predict=clf.predict(Stestdata)

    aa=metrics.accuracy_score(Stestlabel, predict)

最后就是对照的过程，数据处理过程比较繁琐。把大类的概率定义为小类相加，比如预测一个工程存在的大类技术问题，大类1的概率=小类1.1的概率+1.2+1.3，以此类推，再从11个概率中挑前3个最大概率对应的大类问题。

下图是大类与小类的对应关系：

    ''''============================输出技术问题及其可能性================'''

    class attri:

        def __init__(self):

            self.key=0

            self.weight=0.0

    '''====================小类======================='''

    attribute_proba=clf.predict_proba(Stestdata)

    #print(type(attribute_proba))

    '''======================================================='''

# =============================================================================

#     myexcel = xlwt.Workbook()

#     sheet = myexcel.add_sheet('sheet')

#     si=-1

#     sj=-1

#     #cys=1

#     for i in attribute_proba:

#         si=si+1

#          #print("对于记录 %d:" % cys)

#         #cys=cys+1

#         for j in i:

#             sj=sj+1

#             sheet.write(si,sj,str(j))

#             #print ("发生技术问题 %d 的可能性是:%.2f %%" % (j.key,j.weight*100))

#         sj=-1

#     myexcel.save("proba.xls")

# =============================================================================

    '''==========================================================================='''

    '''获得小类发生概率和序号的结合，第一个数为序号'''

    Bnew2=[]

    count=0

    for i in attribute_proba:

        ls=[]

        ls.append(Snew[count])

        count=count+1

        for j in i:

            ls.append(j)

        Bnew2.append(ls)

    '''对Bnew去重'''

    bol=np.zeros(len(Slast)+1)

    for lis in Bnew2:

        if bol[lis[0]]==0:

            Bnew.append(lis)

            bol[lis[0]]=1

    #print(len(Bnew))#去重后为1162

    for i in range(len(Slast)+1):

        if i==0:

            continue

        if bol[i]==0:

            ls=[]

            ls.append(i)

            for j in range(len(attribute_proba[0])):

                ls.append(0)

            Bnew.append(ls)

    #print("Bnew",len(Bnew)) #为1329

    Bnew.sort(key=operator.itemgetter(0))

    #print(Bnew)

    Bpro=[]

    for line in Bnew:

        ls=np.zeros(12)

        for i in range(len(line)):

            if i==0:

                continue

            elif BvsS[i]==0:

                ls[0]+=line[i]

            elif BvsS[i]==1 or BvsS[i]==2 or BvsS[i]==3:

                ls[1]+=line[i]

            elif BvsS[i]==4 or BvsS[i]==5 or BvsS[i]==6:

                ls[2]+=line[i]

            elif BvsS[i]==8 or BvsS[i]==9 or BvsS[i]==10 or BvsS[i]==11 or BvsS[i]==12:

                ls[3]+=line[i]

            elif BvsS[i]==13 or BvsS[i]==14 or BvsS[i]==15 or BvsS[i]==16:

                ls[4]+=line[i]

            elif BvsS[i]==17 or BvsS[i]==18 or BvsS[i]==19 or BvsS[i]==20 or BvsS[i]==21 or BvsS[i]==22 or BvsS[i]==23 or BvsS[i]==24:

                ls[5]+=line[i]

            elif BvsS[i]==25 or BvsS[i]==26 or BvsS[i]==27 or BvsS[i]==28 or BvsS[i]==29:

                ls[6]+=line[i]

            elif BvsS[i]==30 or BvsS[i]==31 or BvsS[i]==32:

                ls[7]+=line[i]

            elif BvsS[i]==33:

                ls[8]+=line[i]

            elif BvsS[i]==34 or BvsS[i]==35 or BvsS[i]==36 or BvsS[i]==37:

                ls[9]+=line[i]

            elif BvsS[i]==38 or BvsS[i]==39 :

                ls[10]+=line[i]

            else:

                ls[11]+=line[i]

        Bpro.append(ls)

    Bnew=[]

    Bnew4=[]

    for line in Bpro:

        ls=[]

        for i in range(len(line)):

            a=obj()

            a.key=i

            a.weight=line[i]

            ls.append(a)

        ls.sort(key=lambda obj: obj.weight, reverse=True)

        Bnew.append(ls)

        lis=[]

        lis.append(ls[0].key)

        lis.append(ls[1].key)

        lis.append(ls[2].key)

        lis.append(ls[3].key)

        Bnew4.append(lis)

    #print(Bnew4)

    '''计算大类准确率,转成01编码'''

    Bnew_one=[]

    for lis in Bnew4:

        bol=np.zeros(12)

        bol=bol.tolist()

        bol[lis[0]],bol[lis[1]],bol[lis[2]],bol[lis[3]]=1,1,1,1

        Bnew_one.append(bol)

    Bnew_one=np.array(Bnew_one)

    #print(Snew_one.shape)

    #print(Slast_one.shape)#都为1329*51

    count=0

    si=-1

    for line in Blast:

        si+=1

        f=1

        for j in line:

            if j == 0:

                continue

            if Bnew_one[si,j]!=1:

                f=0

                break

        if f==1:

            count=count+1

    print("大类准确率为：",count*1.0/s)

    '''准确率计算完毕'''

# =============================================================================

#

#     myexcel = xlwt.Workbook()

#     sheet = myexcel.add_sheet('sheet')

#     si=-1

#     sj=-1

#     #cys=1

#     for i in Bnew4:

#         si=si+1

#          #print("对于记录 %d:" % cys)

#         #cys=cys+1

#         for j in i:

#             sj=sj+1

#             sheet.write(si,sj,str(j))

#             #print ("发生技术问题 %d 的可能性是:%.2f %%" % (j.key,j.weight*100))

#         sj=-1

#     myexcel.save("Bpro_new.xls")

#

#     myexcel = xlwt.Workbook()

#     sheet = myexcel.add_sheet('sheet')

#     si=-1

#     sj=-1

#     #cys=1

#     for i in Blast:

#         si=si+1

#          #print("对于记录 %d:" % cys)

#         #cys=cys+1

#         for j in i:

#             sj=sj+1

#             sheet.write(si,sj,str(j))

#             #print ("发生技术问题 %d 的可能性是:%.2f %%" % (j.key,j.weight*100))

#         sj=-1

#     myexcel.save("Bpro_last.xls")

#

# =============================================================================

# =============================================================================

#     class obj:

#     def __init__(self):

#         self.key=0

#         self.weight=0.0

# =============================================================================

    '''================================================================================'''

    label=[]

    for i in attribute_proba:

        lis=[]

        k=0

        while k<4:

            k=k+1

            p=1

            mm=0

            sj=-1

            for j in i:

                sj=sj+1

                if j>mm:

                    mm=j

                    p=sj

            i[p]=0#难道是从1开始？

            #a=attri()

            #a.key=p

            #a.weight=mm

            #lis.append(a)

            lis.append(p)

        label.append(lis)

    #接下来将label和snew结合，再排序去重就可以和slast比较了

    #print("为什么都超限",len(Snew))

    #print("label",len(label))

    count=0

    for lis in label:

        lis.append(Snew[count])

        count=count+1

    print("结合完成，准备去重！")#此时label和Snew的长度都为1439

    bol=np.zeros(len(label)+1)

    Snew=[]

    for lis in label:

        if bol[lis[4]]==0:

            Snew.append(lis)

            bol[lis[4]]=1

    #print(len(Snew))#去重后为1162

    for i in range(len(Slast)+1):

        if i==0:

            continue

        if bol[i]==0:

            ls=[]

            ls.append(0)

            ls.append(0)

            ls.append(0)

            ls.append(0)

            ls.append(i)

            Snew.append(ls)

    #print("Snew",len(Snew)) #为1329

    print("去重完毕，准备排序！")

    Snew.sort(key=operator.itemgetter(4))

    print("排序完毕，准备比较!")

    s=len(Snew)

    #print(s)

    Snew_one=[]

    for lis in Snew:

        bol=np.zeros(51)

        bol=bol.tolist()

        bol[lis[0]],bol[lis[1]],bol[lis[2]],bol[lis[3]]=1,1,1,1

        Snew_one.append(bol)

    Snew_one=np.array(Snew_one)

    #print(Snew_one.shape)

    #print(Slast_one.shape)#都为1329*51

    count=0

    si=-1

    for line in Slast:

        si+=1

        f=1

        for j in line:

            if j == 0:

                continue

            if Snew_one[si,j]!=1:

                f=0

                break

        if f==1:

            count=count+1

            print(si)

    print(count)

    print(s)

    print("小类准确率为：",count*1.0/s)

    myexcel = xlwt.Workbook()

    sheet = myexcel.add_sheet('sheet')

    si=-1

    sj=-1

    #cys=1

    #print(Snew)

    for i in Snew:

        si=si+1

         #print("对于记录 %d:" % cys)

        #cys=cys+1

        for j in i:

            sj=sj+1

            sheet.write(si,sj,str(j))

            #print ("发生技术问题 %d 的可能性是:%.2f %%" % (j.key,j.weight*100))

        sj=-1

    myexcel.save("Snew.xls") 

    myexcel = xlwt.Workbook()

    sheet = myexcel.add_sheet('sheet')

    si=-1

    sj=-1

    #cys=1

    for i in Slast:

        si=si+1

         #print("对于记录 %d:" % cys)

        #cys=cys+1

        for j in i:

            sj=sj+1

            sheet.write(si,sj,str(j))

            #print ("发生技术问题 %d 的可能性是:%.2f %%" % (j.key,j.weight*100))

        sj=-1

    myexcel.save("Slast1.xls")       

# =============================================================================

#     print('挑几个输出')

#     import xlwt

#     myexcel = xlwt.Workbook()

#     sheet = myexcel.add_sheet('sheet')

#     si=-2

#     sj=-1

#     #cys=1

#     for i in label:

#         si=si+2

#          #print("对于记录 %d:" % cys)

#         #cys=cys+1

#         for j in i:

#             sj=sj+1

#             sheet.write(si,sj,str(j.key))

#             sheet.write(si+1,sj,str(j.weight))

#             #print ("发生技术问题 %d 的可能性是:%.2f %%" % (j.key,j.weight*100))

#         sj=-1

#     myexcel.save("proba_small.xls")

#

# =============================================================================

# -*- coding: utf-8 -*-

python实战——文本挖掘+xgboost预测+数据处理+准确度计算整合版

python实战——文本挖掘+xgboost预测+数据处理+准确度计算整合版的相关教程结束。

相关推荐

机器学习实战（Machine Learning in Action）学习笔记————03.决策树原理、源码解析及测试

SpringBoot自定义注解+AOP+redis实现防接口幂等性重复提交，从概念到实战

SQL注入三连实战绕过WTS-WAF

Java多线程编程模式实战指南一：Active Object模式（上）

spring boot插件开发实战和原理

【eclipse插件开发实战】Eclipse插件开发2——SWT

【eclipse插件开发实战】Eclipse插件开发3——OSGi、RCP

【eclipse插件开发实战】Eclipse插件开发7——插件发布jar包