python3.7 过滤出excel中重复数据,模糊匹配出用户名和邮箱相似数据----例子
发布日期:2021-06-30 15:37:01
浏览次数:3
分类:技术文章
本文共 8248 字,大约阅读时间需要 27 分钟。
import xlrdimport xlwtimport difflibarrayNum = 6tables = []newTables = []def read_excel(): # 打开文件 workbook = xlrd.open_workbook(r'test.xlsx') # 获取所有sheet sheet_name = workbook.sheet_names()[0] # 根据sheet索引或者名称获取sheet内容 sheet = workbook.sheet_by_index(0) # sheet索引从0开始 # sheet = workbook.sheet_by_name('Sheet1') #print (workboot.sheets()[0]) # sheet的名称,行数,列数 print (sheet.name,sheet.nrows,sheet.ncols) # 获取整行和整列的值(数组) rows = sheet.row_values(1) # 获取第2行内容 # cols = sheet.col_values(2) # 获取第3列内容 #print (rows) # print (cols) arr = [] for rown in range(sheet.nrows): array = {'UID': '', 'ID': '', 'email': '', 'IP': '', 'Login_IP': ''} array['UID'] = sheet.cell_value(rown,0) array['ID'] = sheet.cell_value(rown,1) array['email'] = sheet.cell_value(rown,2) array['IP'] = sheet.cell_value(rown,3) array['Login_IP'] = sheet.cell_value(rown,4) arr.append(array['Login_IP']) tables.append(array) #print(len(tables)) # print(tables[1].get('Login_IP')) #print(arr) arr_1=[] for i in range(len(tables)): # print(tables[i]['Login_IP']) if i !=0: num= arr.count(tables[i]['Login_IP']) if num>1: #arr.append(tables[i]['Login_IP']) arr_1.append(tables[i]) # print("过滤后的IP") # print(arr) #print("过滤后的数据") #print(arr_1) return arr_1 #print (tables[5])#过滤相似用户名def likeNumber(): # 打开文件 workbook = xlrd.open_workbook(r'aa.xlsx') # 根据sheet索引或者名称获取sheet内容 sheet = workbook.sheet_by_index(0) # sheet索引从0开始 # sheet = workbook.sheet_by_name('Sheet1') # print (workboot.sheets()[0]) # sheet的名称,行数,列数 # 获取整行和整列的值(数组) rows = sheet.row_values(1) # 获取第2行内容 # cols = sheet.col_values(2) # 获取第3列内容 # print (rows) # print (cols) arr = [] arr_2 =[] tables_1=[] for rown in range(sheet.nrows): array = {'UID': '', 'ID': '', 'email': '', 'IP': '', 'Login_IP': ''} array['UID'] = sheet.cell_value(rown, 0) array['ID'] = str(sheet.cell_value(rown, 1)) array['email'] = sheet.cell_value(rown, 2) array['IP'] = sheet.cell_value(rown, 3) array['Login_IP'] = sheet.cell_value(rown, 4) #print(array) arr.append(array['ID']) arr_2.append(array['email']) tables_1.append(array) # print(len(tables)) # print(tables[1].get('Login_IP')) # print(arr) # print(tables) a_arr=[] for i in range(len(tables_1)): if i != 0: a = difflib.get_close_matches(tables_1[i]['ID'], arr, 100, cutoff=0.7) if len(a) > 1: if a[0] != "": for i in range(len(a)): print("读取用户名:", a[i]) a_arr.append(a[i]) a_arr = list(set(a_arr)) #print (a_arr) #print (b_arr) arr_1 = [] for i in range(len(tables_1)): if i != 0: if tables_1[i]['ID'] in a_arr: arr_1.append(tables_1[i]) #print(arr_1) return arr_1 # print (tables[5])#过滤相似邮箱def likeEmail(): # 打开文件 workbook = xlrd.open_workbook(r'aa.xlsx') # 根据sheet索引或者名称获取sheet内容 sheet = workbook.sheet_by_index(0) # sheet索引从0开始 # sheet = workbook.sheet_by_name('Sheet1') # print (workboot.sheets()[0]) # sheet的名称,行数,列数 # 获取整行和整列的值(数组) rows = sheet.row_values(1) # 获取第2行内容 # cols = sheet.col_values(2) # 获取第3列内容 # print (rows) # print (cols) arr = [] arr_2 = [] tables_1 = [] for rown in range(sheet.nrows): array = {'UID': '', 'ID': '', 'email': '', 'IP': '', 'Login_IP': ''} array['UID'] = sheet.cell_value(rown, 0) array['ID'] = str(sheet.cell_value(rown, 1)) array['email'] = str(sheet.cell_value(rown, 2)) array['IP'] = sheet.cell_value(rown, 3) array['Login_IP'] = sheet.cell_value(rown, 4) # print(array) arr.append(array['ID']) arr_2.append(array['email']) tables_1.append(array) # print(len(tables)) # print(tables[1].get('Login_IP')) # print(arr) # print(tables) b_arr = [] for i in range(len(tables_1)): if i != 0: mStr =tables_1[i]['email'] email_1=mStr.split("@") b = difflib.get_close_matches(email_1[0], arr_2, 100, cutoff=0.4) if len(b) > 1: if b[0] != "": for i in range(len(b)): print("读取邮箱:", b[i]) b_arr.append(b[i]) b_arr = list(set(b_arr)) # print (a_arr) # print (b_arr) arr_1 = [] for i in range(len(tables_1)): if i != 0: if tables_1[i]['email'] in b_arr: arr_1.append(tables_1[i]) # print(arr_1) return arr_1 # print (tables[5])#写入数据class WriteExcel: # 初始化 def __init__(self, filename, sheet_name): self.work_book = xlwt.Workbook(encoding="UTF-8") self.worksheet = self.work_book.add_sheet(sheet_name) self.filename = filename self.row = 0 # 保存Excel def save(self): self.work_book.save(self.filename) # 设置样式 def set_style(self, name, height, bold=False, format_str='', align='center'): style = xlwt.XFStyle() # 初始化样式 font = xlwt.Font() # 为样式创建字体 font.name = name # 字体 font.bold = bold font.height = height borders = xlwt.Borders() # 为样式创建边框 borders.left = 1 borders.right = 1 borders.top = 1 borders.bottom = 1 alignment = xlwt.Alignment() # 设置排列 if align == 'center': alignment.horz = xlwt.Alignment.HORZ_CENTER alignment.vert = xlwt.Alignment.VERT_CENTER elif align == 'left': alignment.horz = xlwt.Alignment.HORZ_LEFT alignment.vert = xlwt.Alignment.VERT_BOTTOM else: alignment.horz = xlwt.Alignment.HORZ_RIGHT alignment.vert = xlwt.Alignment.VERT_BOTTOM style.font = font style.borders = borders style.num_format_str = format_str style.alignment = alignment return style # 设置标题的格式 def set_title_style(self): return self.set_style('黑体', 300, bold=True, format_str='') # 设置表头的格式 def set_head_style(self): head_style = self.set_style('Times New Roman', 220, bold=True, format_str='') pattern = xlwt.Pattern() # 一个实例化的样式类 pattern.pattern = xlwt.Pattern.SOLID_PATTERN # 固定的样式 pattern.pattern_fore_colour = xlwt.Style.colour_map['yellow'] # 背景颜色 head_style.pattern = pattern return head_style # 设置明细行的格式 def set_default_style(self): return self.set_style('Times New Roman', 200, bold=False, format_str='', align='right') # 添加标题 def add_title(self, title): self.worksheet.write_merge(0, 0, 0, 2, title, self.set_title_style()) self.row += 1 # 写入文件头 def add_head(self, key, value): # 向单元格中写入内容 self.worksheet.write(self.row, 0, key) self.worksheet.write(self.row, 1, value) self.row += 1 # 写入明细 def add_list(self, table_head, table_detail): self.row += 1 for i, value in enumerate(table_head): self.worksheet.write(self.row, i, value, self.set_head_style()) self.worksheet.col(i).width = 150 * 30 for rows in table_detail: self.row += 1 for i, key in enumerate(rows): self.worksheet.write(self.row, i, rows[key])if __name__ == '__main__': # 读取Excel list_detail= read_excel(); print ('读取成功') list_head = ["UID", "ID", "email","IP","Login_IP"] writeExcel = WriteExcel("aa.xlsx", "统计") writeExcel.add_title("IP统计表") writeExcel.add_list(list_head, list_detail) writeExcel.save() print ('写入成功') print ("=======================") print("第二次读取") likeNumber=likeNumber() print("用户名读取成功") writeExcel_1 = WriteExcel("用户名.xlsx", "用户名相似") writeExcel_1.add_title("用户名相似") writeExcel_1.add_list(list_head, likeNumber) writeExcel_1.save() print("用户名写入成功") print ("=======================") print("第三次读取") likeEmail = likeEmail() print("邮箱读取成功") writeExcel_1 = WriteExcel("邮箱.xlsx", "邮箱相似") writeExcel_1.add_title("邮箱相似") writeExcel_1.add_list(list_head, likeEmail) writeExcel_1.save() print("邮箱读取成功")
转载地址:https://jsonll.blog.csdn.net/article/details/103241722 如侵犯您的版权,请留言回复原文章的地址,我们会给您删除此文章,给您带来不便请您谅解!
发表评论
最新留言
很好
[***.229.124.182]2024年04月30日 22时09分07秒
关于作者
喝酒易醉,品茶养心,人生如梦,品茶悟道,何以解忧?唯有杜康!
-- 愿君每日到此一游!
推荐文章
包机制介绍
2019-05-01
JavaDoc---生成自己的API文档
2019-05-01
Scanner对象的介绍
2019-05-01
Java三种流程结构介绍
2019-05-01
Java 方法(函数)详解
2019-05-01
Java数组详解
2019-05-01
Java面向对象详解
2019-05-01
Java static 关键字
2019-05-01
Java抽象类
2019-05-01
Java接口介绍
2019-05-01
Java内部类
2019-05-01
在Debian 8上使用Apt-Get安装Java
2019-05-01
vs中动态DLL与静态LIB工程中加入版本信息的方法
2019-05-01
大数据分析技术与应用一站式学习(值得收藏)_v20200418
2019-05-01
ios开发手册_v20200718
2019-05-01
TortoiseGit客户端设置中文显示
2019-05-01
Qt 在windows下的串口读写
2019-05-01
如何在SpringBoot中使用Atomikos实现分布式事务
2019-05-01
如何在SpringMVC中配置Atomikos分布式事务
2019-05-01
Spring Boot 注解@EnableAutoConfiguration的解析
2019-05-01