本文共 14921 字,大约阅读时间需要 49 分钟。
备注:本程序是工信部机器学习相关培训样例程序
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Decision-tree demo: load a labelled table from an Excel sheet, encode the
text columns as integers, fit a scikit-learn decision tree and render it.

Expected sheet layout (inside the selected column window): the first column
is a sequence number, the last column is the label, and every column in
between is a feature.

The third-party packages (xlrd, scikit-learn, pydotplus) are imported inside
the functions that need them, so the pure helpers ``is_number`` and
``list_to_value`` can be imported without the spreadsheet/ML stack installed.

NOTE(review): xlrd >= 2.0 is documented to read only legacy ``.xls`` files;
the ``.xlsx`` datasets used below presumably need xlrd < 2.0 (or a converted
file) -- confirm against the installed version.
"""
import copy
import re
import unicodedata
from io import StringIO

import numpy as np

# Plain signed decimal: "12", "-3.5", "+.5", "1." -- matched against the WHOLE
# string (the original pattern left one alternative unanchored at the end).
_DECIMAL_RE = re.compile(r'[-+]?(?:\d+\.?\d*|\.\d+)')


def is_number(num):
    """Return True if the string *num* represents a number.

    A string counts as a number when it is a plain signed decimal literal, or
    when it is a single character with a Unicode numeric value (for example a
    Chinese numeral such as '三').

    Args:
        num: the text to inspect.

    Returns:
        True when *num* is numeric in the sense above, False otherwise
        (including for the empty string).
    """
    if _DECIMAL_RE.fullmatch(num):
        return True
    try:
        # Handles numbers written as a single non-ASCII character.
        unicodedata.numeric(num)
    except (TypeError, ValueError):
        return False
    return True


def _to_number(text):
    """Convert text already accepted by ``is_number`` to a float."""
    try:
        return float(text)
    except ValueError:
        # float() rejects characters such as '三' or '½'; their value is
        # available from the Unicode database instead.
        return unicodedata.numeric(text)


def sheet_to_array(filename, sheet_number, first_col=0, last_col=None, header=True):
    """Return the feature rows and labels stored in one Excel sheet.

    Columns ``first_col`` (inclusive) to ``last_col`` (exclusive) form the
    working window. Inside that window the first column is treated as a
    sequence number and dropped, the last column is the label, and the
    columns in between are features. With the defaults the window is the
    whole sheet.

    Text cells are stripped; text that looks numeric (see ``is_number``) is
    converted to float. Cells of any other type keep their raw value.

    Args:
        filename: path of the workbook to open.
        sheet_number: zero-based index of the sheet to read.
        first_col: first column of the window.
        last_col: column after the last one of the window; a falsy value
            means "up to the last column of the sheet".
        header: if True, the first row is a header row and is not data.

    Returns:
        A tuple ``(data, datatype, labellist, labeltype, header_names)``:
        ``data`` is a list of feature rows (mixed floats and strings),
        ``datatype`` holds the xlrd cell type code of each feature column as
        found in the first data row, ``labellist`` holds the label values,
        ``labeltype`` is the cell type code of the last label read (None if
        there are no data rows) and ``header_names`` holds the feature
        column titles (empty when ``header`` is False).

    Raises:
        ValueError: if ``first_col`` is not smaller than ``last_col``.
    """
    from xlrd import XL_CELL_TEXT, open_workbook

    book = open_workbook(filename)
    sheet0 = book.sheet_by_index(sheet_number)
    rows = sheet0.nrows

    if not last_col:
        last_col = sheet0.ncols
    if first_col >= last_col:
        raise ValueError("First column must be smaller than last column!")
    cols = range(first_col, last_col)

    skip = 1 if header else 0
    header_names = []
    if header and rows > 0:
        # Same window as the data rows, minus sequence-number and label.
        header_names = [cell.value for cell in sheet0.row(0)[first_col:last_col][1:-1]]
    print(header_names)
    print('define: ', len(cols), rows - skip)

    # Reference cell types, taken from the first data row.
    if rows > skip:
        datatype = [cell.ctype for cell in sheet0.row(skip)[first_col:last_col][1:-1]]
    else:
        datatype = []
    print(datatype)

    data = []
    labellist = []
    labeltype = None
    checktypediff = False  # set when a cell's type differs from the first data row
    for row in range(skip, rows):
        window = sheet0.row(row)[first_col:last_col]
        label = window[-1]
        labellist.append(label.value)
        labeltype = label.ctype

        record = []
        for col, cell in enumerate(window[1:-1]):
            value = cell.value
            if cell.ctype == XL_CELL_TEXT:
                value = value.strip()
                if is_number(value):
                    value = _to_number(value)
            record.append(value)
            if datatype[col] != cell.ctype:
                checktypediff = True
        data.append(record)

    if checktypediff:
        print('发现某一列的数据存在不一致的数据类型!!!')
    return data, datatype, labellist, labeltype, header_names


def list_to_value(srcList):
    """Map every distinct item of *srcList* to an integer code.

    Codes are assigned in order of first appearance, so the mapping is the
    same on every run (iterating a set of strings is not).

    Args:
        srcList: list of hashable items.

    Returns:
        A tuple ``(outputDict, outputList)``: the item -> code dictionary and
        the list of codes corresponding to *srcList*.
    """
    outputDict = {}
    for item in srcList:
        outputDict.setdefault(item, len(outputDict))
    outputList = [outputDict[item] for item in srcList]
    return outputDict, outputList


def main(filename="../dataset/appleororange-2.xlsx", sheet=0, test_idx=()):
    """Train a decision tree on the sheet and write its graph as PDF and JPG.

    Args:
        filename: workbook to read, e.g. "houseloan-data-2.xlsx" or
            "playtennis-data-2.xlsx".
        sheet: index of the sheet being processed.
        test_idx: row indices held out for testing, e.g. [55, 56, 57, 58, 59].
            When empty, the tree is trained and evaluated on all rows.
    """
    import pydotplus
    from sklearn import tree
    from xlrd import XL_CELL_TEXT

    rawfeaturedata, featuredatatype, labellist, labeltype, header = sheet_to_array(
        filename, sheet, header=True)
    # Encode a deep copy so the values read from the spreadsheet stay intact.
    featuredata = copy.deepcopy(rawfeaturedata)
    print('Features:', featuredata)
    print('Feature Type:', featuredatatype)

    for j, coltype in enumerate(featuredatatype):
        print(j, coltype)
        thiscollist = [record[j] for record in featuredata]
        # Text columns are replaced by integer codes. The isinstance check
        # also catches strings in a column whose first cell was not text.
        if coltype == XL_CELL_TEXT or any(isinstance(v, str) for v in thiscollist):
            print(thiscollist)
            colDict, colValues = list_to_value(thiscollist)
            print(colDict, colValues)
            for record, code in zip(featuredata, colValues):
                record[j] = code
    print('New Features:', featuredata)

    print('Labels: ', labellist, labeltype)
    labelDict, labelValues = list_to_value(labellist)
    print(labelDict)
    print(labelValues)

    data = np.array(featuredata)
    test_idx = list(test_idx)
    if not test_idx:
        train_data, train_target = data, labelValues
        test_data, test_target = data, labelValues
    else:
        train_data = np.delete(data, test_idx, axis=0)
        train_target = np.delete(labelValues, test_idx)
        test_data = data[test_idx]
        # Pick the labels of exactly the held-out rows.
        test_target = [labelValues[i] for i in test_idx]

    # Core decision-tree call.
    dt = tree.DecisionTreeClassifier(criterion='entropy')
    dt.fit(train_data, train_target)

    # Expected labels followed by the predictions.
    print(test_target)
    print(dt.predict(test_data))

    dot_data = StringIO()
    print(labelDict.keys())
    tree.export_graphviz(dt, out_file=dot_data,
                         feature_names=header,
                         # keys are in code order (0, 1, ...), as class_names requires
                         class_names=list(labelDict.keys()),
                         filled=True, rounded=True,
                         impurity=False)
    print(dot_data)
    # Swap in a font with CJK glyphs. The font name may appear quoted or
    # unquoted in the DOT text (presumably depending on the scikit-learn
    # version), so normalise to the unquoted form before replacing.
    dot_text = dot_data.getvalue().replace('"helvetica"', 'helvetica')
    graph = pydotplus.graph_from_dot_data(dot_text.replace('helvetica', '"Microsoft YaHei"'))
    print(graph)
    outputfilename = filename + '-graph'
    graph.write_pdf(outputfilename + ".pdf")
    graph.write_jpg(outputfilename + ".jpg")


if __name__ == '__main__':
    main()
运行结果
runfile('D:/ai/py/dicision-tree.py', wdir='D:/ai/py')['重量', '表皮光滑度']define: 4 6[['', ''], ['', ''], ['', ''], ['', ''], ['', ''], ['', '']][2, 1]Features: [[140.0, '光滑'], [130.0, '光滑'], [150.0, '粗糙'], [170.0, '粗糙'], [150.0, '光滑'], [130.0, '粗糙']]Feature Type: [2, 1]0 21 1['光滑', '光滑', '粗糙', '粗糙', '光滑', '粗糙']{'光滑': 0, '粗糙': 1} [0, 0, 1, 1, 0, 1]New Features: [[140.0, 0], [130.0, 0], [150.0, 1], [170.0, 1], [150.0, 0], [130.0, 1]]Labels: ['橘子', '橘子', '苹果', '苹果', '橘子', '橘子'] 1{'苹果': 0, '橘子': 1}[1, 1, 0, 0, 1, 1][1, 1, 0, 0, 1, 1][1 1 0 0 1 1]dict_keys(['苹果', '橘子'])<_io.StringIO object at 0x000001D93BBA00D8>In [16]:
修改原脚本第 127 行,更换源数据文件,如下:
filename = "../dataset/houseloan-data-2.xlsx"
修改原脚本第 156 行,启用测试集索引(即最后 5 行数据作为测试数据):
test_idx = [55, 56, 57, 58, 59]
#test_idx = []
重新执行
runfile('D:/ai/py/dicision-tree.py', wdir='D:/ai/py')['年龄', '学历', '月收入', '婚姻状况', '是否有房产']define: 7 60[['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', ''], ['', '', '', '', '']][2, 1, 1, 1, 1]Features: [[26.0, '本科', '低', '未婚', '否'], [24.0, '大专', '低', '已婚', '否'], [28.0, '硕士', '中', '未婚', '是'], [29.0, '本科', '低', '已婚', '是'], [29.0, '博士', '高', '已婚', '否'], [32.0, '本科', '中', '已婚', '是'], [26.0, '硕士', '中', '未婚', '否'], [33.0, '本科', '低', '未婚', '是'], [35.0, '研究生', '中', '已婚', '是'], [31.0, '大专', '中', '已婚', '否'], [30.0, '本科', '低', '未婚', '是'], [29.0, '研究生', '高', '已婚', '否'], [30.0, '大专', '中', '已婚', '否'], [34.0, '本科', '高', '未婚', '是'], [27.0, '本科', '高', '未婚', '是'], [26.0, '本科', '中', '已婚', '是'], [23.0, '大专', '低', '已婚', '否'], [30.0, '大专', '低', '未婚', '是'], [23.0, 
'博士', '低', '未婚', '否'], [25.0, '硕士', '中', '未婚', '是'], [26.0, '本科', '中', '已婚', '是'], [24.0, '本科', '中', '已婚', '否'], [32.0, '硕士', '中', '未婚', '否'], [34.0, '博士', '中', '已婚', '否'], [38.0, '大专', '中', '已婚', '是'], [35.0, '大专', '低', '未婚', '否'], [34.0, '本科', '低', '已婚', '否'], [32.0, '硕士', '低', '未婚', '是'], [30.0, '大专', '低', '未婚', '否'], [31.0, '本科', '低', '已婚', '是'], [40.0, '大专', '高', '未婚', '否'], [34.0, '大专', '中', '已婚', '否'], [45.0, '大专', '中', '已婚', '是'], [53.0, '大专', '高', '未婚', '是'], [34.0, '大专', '高', '已婚', '是'], [23.0, '本科', '高', '未婚', '否'], [35.0, '本科', '中', '已婚', '否'], [54.0, '大专', '高', '已婚', '是'], [65.0, '本科', '高', '未婚', '否'], [45.0, '大专', '中', '已婚', '是'], [34.0, '大专', '低', '未婚', '否'], [37.0, '本科', '中', '已婚', '否'], [42.0, '大专', '高', '已婚', '是'], [44.0, '本科', '中', '未婚', '是'], [28.0, '本科', '中', '已婚', '否'], [26.0, '本科', '中', '已婚', '是'], [25.0, '本科', '中', '已婚', '否'], [43.0, '博士', '高', '已婚', '否'], [48.0, '本科', '中', '未婚', '是'], [36.0, '硕士', '高', '未婚', '是'], [33.0, '大专', '中', '已婚', '是'], [30.0, '本科', '中', '已婚', '否'], [28.0, '研究生', '高', '未婚', '否'], [26.0, '研究生', '高', '已婚', '否'], [32.0, '硕士', '高', '已婚', '是'], [42.0, '本科', '中', '已婚', '否'], [47.0, '硕士', '高', '已婚', '否'], [45.0, '大专', '中', '未婚', '是'], [37.0, '本科', '中', '已婚', '是'], [35.0, '本科', '中', '已婚', '否']]Feature Type: [2, 1, 1, 1, 1]0 21 1['本科', '大专', '硕士', '本科', '博士', '本科', '硕士', '本科', '研究生', '大专', '本科', '研究生', '大专', '本科', '本科', '本科', '大专', '大专', '博士', '硕士', '本科', '本科', '硕士', '博士', '大专', '大专', '本科', '硕士', '大专', '本科', '大专', '大专', '大专', '大专', '大专', '本科', '本科', '大专', '本科', '大专', '大专', '本科', '大专', '本科', '本科', '本科', '本科', '博士', '本科', '硕士', '大专', '本科', '研究生', '研究生', '硕士', '本科', '硕士', '大专', '本科', '本科']{'研究生': 0, '大专': 1, '硕士': 2, '本科': 3, '博士': 4} [3, 1, 2, 3, 4, 3, 2, 3, 0, 1, 3, 0, 1, 3, 3, 3, 1, 1, 4, 2, 3, 3, 2, 4, 1, 1, 3, 2, 1, 3, 1, 1, 1, 1, 1, 3, 3, 1, 3, 1, 1, 3, 1, 3, 3, 3, 3, 4, 3, 2, 1, 3, 0, 0, 2, 3, 2, 1, 3, 3]2 1['低', '低', '中', '低', '高', '中', '中', '低', '中', '中', '低', '高', '中', '高', '高', '中', '低', '低', '低', '中', '中', '中', '中', 
'中', '中', '低', '低', '低', '低', '低', '高', '中', '中', '高', '高', '高', '中', '高', '高', '中', '低', '中', '高', '中', '中', '中', '中', '高', '中', '高', '中', '中', '高', '高', '高', '中', '高', '中', '中', '中']{'低': 0, '高': 1, '中': 2} [0, 0, 2, 0, 1, 2, 2, 0, 2, 2, 0, 1, 2, 1, 1, 2, 0, 0, 0, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 1, 2, 2, 1, 1, 1, 2, 1, 1, 2, 0, 2, 1, 2, 2, 2, 2, 1, 2, 1, 2, 2, 1, 1, 1, 2, 1, 2, 2, 2]3 1['未婚', '已婚', '未婚', '已婚', '已婚', '已婚', '未婚', '未婚', '已婚', '已婚', '未婚', '已婚', '已婚', '未婚', '未婚', '已婚', '已婚', '未婚', '未婚', '未婚', '已婚', '已婚', '未婚', '已婚', '已婚', '未婚', '已婚', '未婚', '未婚', '已婚', '未婚', '已婚', '已婚', '未婚', '已婚', '未婚', '已婚', '已婚', '未婚', '已婚', '未婚', '已婚', '已婚', '未婚', '已婚', '已婚', '已婚', '已婚', '未婚', '未婚', '已婚', '已婚', '未婚', '已婚', '已婚', '已婚', '已婚', '未婚', '已婚', '已婚']{'未婚': 0, '已婚': 1} [0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1]4 1['否', '否', '是', '是', '否', '是', '否', '是', '是', '否', '是', '否', '否', '是', '是', '是', '否', '是', '否', '是', '是', '否', '否', '否', '是', '否', '否', '是', '否', '是', '否', '否', '是', '是', '是', '否', '否', '是', '否', '是', '否', '否', '是', '是', '否', '是', '否', '否', '是', '是', '是', '否', '否', '否', '是', '否', '否', '是', '是', '否']{'否': 0, '是': 1} [0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0]New Features: [[26.0, 3, 0, 0, 0], [24.0, 1, 0, 1, 0], [28.0, 2, 2, 0, 1], [29.0, 3, 0, 1, 1], [29.0, 4, 1, 1, 0], [32.0, 3, 2, 1, 1], [26.0, 2, 2, 0, 0], [33.0, 3, 0, 0, 1], [35.0, 0, 2, 1, 1], [31.0, 1, 2, 1, 0], [30.0, 3, 0, 0, 1], [29.0, 0, 1, 1, 0], [30.0, 1, 2, 1, 0], [34.0, 3, 1, 0, 1], [27.0, 3, 1, 0, 1], [26.0, 3, 2, 1, 1], [23.0, 1, 0, 1, 0], [30.0, 1, 0, 0, 1], [23.0, 4, 0, 0, 0], [25.0, 2, 2, 0, 1], [26.0, 3, 2, 1, 1], [24.0, 3, 2, 1, 0], [32.0, 2, 2, 0, 0], [34.0, 4, 2, 1, 0], [38.0, 1, 2, 1, 1], [35.0, 1, 0, 0, 0], [34.0, 3, 0, 
1, 0], [32.0, 2, 0, 0, 1], [30.0, 1, 0, 0, 0], [31.0, 3, 0, 1, 1], [40.0, 1, 1, 0, 0], [34.0, 1, 2, 1, 0], [45.0, 1, 2, 1, 1], [53.0, 1, 1, 0, 1], [34.0, 1, 1, 1, 1], [23.0, 3, 1, 0, 0], [35.0, 3, 2, 1, 0], [54.0, 1, 1, 1, 1], [65.0, 3, 1, 0, 0], [45.0, 1, 2, 1, 1], [34.0, 1, 0, 0, 0], [37.0, 3, 2, 1, 0], [42.0, 1, 1, 1, 1], [44.0, 3, 2, 0, 1], [28.0, 3, 2, 1, 0], [26.0, 3, 2, 1, 1], [25.0, 3, 2, 1, 0], [43.0, 4, 1, 1, 0], [48.0, 3, 2, 0, 1], [36.0, 2, 1, 0, 1], [33.0, 1, 2, 1, 1], [30.0, 3, 2, 1, 0], [28.0, 0, 1, 0, 0], [26.0, 0, 1, 1, 0], [32.0, 2, 1, 1, 1], [42.0, 3, 2, 1, 0], [47.0, 2, 1, 1, 0], [45.0, 1, 2, 0, 1], [37.0, 3, 2, 1, 1], [35.0, 3, 2, 1, 0]]Labels: ['否', '否', '是', '否', '是', '是', '否', '否', '是', '是', '否', '是', '是', '是', '是', '是', '否', '是', '否', '是', '是', '是', '否', '否', '是', '否', '否', '否', '否', '否', '是', '否', '否', '是', '是', '是', '否', '是', '是', '否', '否', '否', '是', '是', '是', '是', '否', '是', '是', '是', '是', '否', '是', '是', '是', '是', '是', '否', '是', '否'] 1{'否': 0, '是': 1}[0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0][1, 1, 0, 1, 0][0 1 0 1 0]dict_keys(['否', '是'])<_io.StringIO object at 0x000001D93BB73168>
转载地址:http://ngvab.baihongyu.com/