1.数据处理之前先进行数据清洗,把自己所需要的好的数据提取出来
import codecs

# Step 1 - data cleaning: split the raw dump into "good" and "bad" records.
#
# The input is decoded as GBK, silently dropping undecodable bytes, and both
# outputs are written as UTF-8.
# NOTE(review): the path literals below look like they have lost their
# directory separators - confirm against the real file layout.
filepath = r"G:我的pythonpython基础大数据相关数据1E~001.txt"
# Fixed typo ("pyhon" -> "python") so this matches the path that the
# extraction and classification steps read from.
savegoodfilepath = r"G:我的pythonpython基础day12QQQQGood.txt"
savebadfilepath = r"G:我的pythonpython基础day12QQQQbed.txt"

# Read the whole input into memory as a list of decoded lines (line
# terminators are kept, so they count toward the length check below).
with codecs.open(filepath, "rb", "gbk", "ignore") as file:
    mylist = file.readlines()

# `with` guarantees both outputs are closed even if a write fails.
with open(savegoodfilepath, "wb") as filegood, open(savebadfilepath, "wb") as filebad:
    for line in mylist:
        # A record is good when its length is in 16..35 characters and it
        # splits into exactly two fields on the "----" separator.
        is_good = 15 < len(line) <= 35 and len(line.split("----")) == 2
        target = filegood if is_good else filebad
        target.write(line.encode("utf-8", "ignore"))
2.数据提取,数据清洗之后就要从中提取出自己所需要的部分
# Step 2 - data extraction: copy the second "----"-separated field of every
# record in the cleaned file into its own file.
# NOTE(review): the path literals look like they have lost their directory
# separators - confirm against the real file layout.
filepath = r"G:我的pythonpython基础day12QQQQGood.txt"
savefilepath = r"G:我的pythonpython基础day12QQQQGoodpass.txt"

# `with` closes both files even if decoding or writing fails part-way.
with open(filepath, "rb") as file, open(savefilepath, "wb") as save:
    for line in file:
        fields = line.decode("utf-8", "ignore").split("----")
        if len(fields) < 2:
            # No separator on this line (e.g. a blank line): there is no
            # second field to extract, so skip it instead of raising
            # IndexError and leaving a truncated output file.
            continue
        # The second field is the tail of the line, so it still carries the
        # original line terminator; no newline needs to be added.
        save.write(fields[1].encode("utf-8", "ignore"))
3.数据排序 从提取的数据中 按照自己的需要进行排序
# Step 3 - data sorting: sort the extracted lines and save the result.
# NOTE(review): the path literals look like they have lost their directory
# separators - confirm against the real file layout.
filepath = r"G:我的pythonpython基础day12QQQQGoodpass.txt"
savefilepath = r"G:我的pythonpython基础day12QQQQGoodpasssort.txt"

with open(filepath, "rb") as file:
    mylist = file.readlines()  # whole file in memory, one bytes object per line

# If the final line has no terminator, sorting can move it into the middle
# of the output, where it would be glued to the following record. Give it
# the same terminator style as the first line before sorting.
if mylist and not mylist[-1].endswith(b"\n"):
    newline = b"\r\n" if mylist[0].endswith(b"\r\n") else b"\n"
    mylist[-1] += newline

mylist.sort()  # byte-wise ordering of the raw lines

# The lines are already bytes, so they are written back unchanged instead of
# being decoded and re-encoded one by one.
with open(savefilepath, "wb") as save:
    save.writelines(mylist)
4.排序计数:在已排序的数据中,统计每个重复条目出现的次数
from itertools import groupby

# Step 4 - count duplicates: for each run of identical consecutive lines in
# the input, write "<count> <line>" to the output.
# Precondition: the input must already be sorted, so that equal lines are
# adjacent; otherwise the same value is reported once per run.
# NOTE(review): the path literals look like they have lost their directory
# separators - confirm against the real file layout.
filepath = r"G:我的pythonpython基础day12QQQQGoodpasssort.txt"
savefilepath = r"G:我的pythonpython基础day12QQQQGoodpasssorttimes.txt"

with open(filepath, "rb") as file:
    mylist = file.readlines()

with open(savefilepath, "wb") as save:
    # groupby yields one (value, run) pair per stretch of equal neighbours.
    for password, run in groupby(mylist):
        count = sum(1 for _ in run)
        # `password` keeps its own line terminator, so none is appended.
        save.write(str(count).encode("utf-8") + b" " + password)
5.数据分类
# Step 5 - data classification: distribute the cleaned records into one file
# per account length (5-11 characters), plus a catch-all file for any other
# length.
QQlist = [5, 6, 7, 8, 9, 10, 11, "小垃圾"]  # last label names the catch-all file
# NOTE(review): this input path looks like it has lost its directory
# separators - confirm against the real file layout.
filepath = r"G:我的pythonpython基础day12QQQQGood.txt"
# Raw string without a trailing backslash: the original literal ended in \"
# which escaped the closing quote and made the script a SyntaxError.
output_dir = r"G:\我的python\python基础\day12\QQ\QQ位数分类"

with open(filepath, "rb") as file:
    mylist = file.readlines()

filelist = []
try:
    # One output file per label, opened in the same order as QQlist.
    for i in QQlist:
        QQfilepath = output_dir + "\\" + str(i) + "位QQ.txt"
        filelist.append(open(QQfilepath, "wb"))

    files_by_label = dict(zip(QQlist, filelist))
    fallback = files_by_label[QQlist[-1]]

    for line in mylist:
        # The account is the text before the first "----" separator.
        account = line.decode("utf-8").split("----")[0]
        # Lengths 5..11 have a dedicated file; every other length goes to
        # the catch-all. The original bytes are written back untouched.
        files_by_label.get(len(account), fallback).write(line)
finally:
    # Close whatever was opened, even if opening or writing failed midway.
    for QQfile in filelist:
        QQfile.close()