# Reads two site-inventory spreadsheets, extracts the URL printed in each
# scraped HTML report, and files each report into a per-unit folder.
import os
import shutil
import xlrd
from urllib.parse import urlparse
from pyquery import PyQuery as pq


def eq(l):
    # Print every (url, ip) row of data.xlsx whose url is missing from l.
    workbook = xlrd.open_workbook('data.xlsx')
    table = workbook.sheet_by_index(0)
    for i in range(table.nrows):
        u = table.cell(i, 1).value
        ip = table.cell(i, 0).value
        if u not in l:
            print(u, ' ', ip)


def moveFile(srcfile, dstfile):
    # Move srcfile to dstfile, creating the destination directory if needed.
    if not os.path.isfile(srcfile):
        print('%s does not exist! Please check your input' % srcfile)
    else:
        fpath, fname = os.path.split(dstfile)   # split directory and file name
        if not os.path.exists(fpath):
            os.makedirs(fpath)                  # create the directory
        shutil.move(srcfile, dstfile)           # move the file


def searchdata(l, dir):
    workbook = xlrd.open_workbook(r'C:\Users\yxb\Downloads\汇总高危\网站基本信息20200424(1).xls')
    workbook2 = xlrd.open_workbook(r'C:\Users\yxb\Downloads\汇总高危\网站群网站清单_20200312入库(1).xlsx')
    table1 = workbook.sheet_by_index(0)
    table2 = workbook2.sheet_by_index(0)
    all_data = []   # unit name for each entry of url, kept index-aligned
    un = []         # unit-name column (16) of table1, indexed like ip
    ip = []         # IP column (5) of table1
    url = []
    start = []      # matched keys (url or ip)
    end = []        # destination folder (unit name) for each matched key
    for i in range(table1.nrows):
        u = table1.cell(i, 4).value
        unit = table1.cell(i, -1).value          # last column
        ip.append(table1.cell(i, 5).value)
        un.append(table1.cell(i, 16).value)
        if u == '*' or u == '无':                # '无' means "none"
            u = table1.cell(i, 5).value          # fall back to the IP column
        url.append(u)                            # append exactly once per row;
        all_data.append(unit)                    # duplicate appends broke index alignment
    for i in range(table2.nrows):
        u = table2.cell(i, 0).value
        unit = table2.cell(i, 3).value
        url.append(u)
        all_data.append(unit)
    for i in l:
        if i in url:
            print(i, ' ', all_data[url.index(i)])
            start.append(i)
            end.append(all_data[url.index(i)])
        elif i in ip:
            # use the same unit mapping as the URL branch
            print(i, ' ', all_data[ip.index(i)])
            start.append(i)
            end.append(all_data[ip.index(i)])
    files = os.listdir(dir)
    for name in files:
        path = os.path.join(dir, name)
        if os.path.isfile(path):
            with open(path, encoding='utf-8') as f:
                content = f.read()
            doc = pq(content)           # parse the HTML text
            item = doc('h1')
            s = item.eq(2).text()       # the third <h1> holds the URL/IP
            res = urlparse(s)
            if s in start:
                moveFile(path, 'F:\\scrapy\\819\\' + end[start.index(s)] + '\\')
            elif res.scheme in ('http', 'https', ''):
                if res.netloc == '':
                    # schemeless input: the host ends up in res.path
                    if res.path in start:
                        moveFile(path, 'F:\\scrapy\\819\\' + end[start.index(res.path)] + '\\')
                elif res.netloc in start:
                    moveFile(path, 'F:\\scrapy\\819\\' + end[start.index(res.netloc)] + '\\')
            elif res.scheme in start:
                # a bare "host:port" string parses with the host as res.scheme
                moveFile(path, 'F:\\scrapy\\819\\' + end[start.index(res.scheme)] + '\\')
    print('Operation complete')


def chooseInfo(dir):
    l = []
    listUrl = []
    files = os.listdir(dir)
    for name in files:
        path = os.path.join(dir, name)
        if os.path.isfile(path):
            with open(path, encoding='utf-8') as f:
                content = f.read()
            doc = pq(content)           # parse the HTML text
            item = doc('h1')
            s = item.eq(2).text()
            l.append(s)
    for url in l:
        res = urlparse(url)
        if res.scheme in ('http', 'https', ''):
            if res.netloc == '':
                listUrl.append(res.path)
            else:
                listUrl.append(res.netloc)   # bug fix: was appended to the file list
        else:
            listUrl.append(res.scheme)
    return listUrl


a = chooseInfo(r'C:\Users\yxb\Downloads\汇总高危\总')
searchdata(a, r'C:\Users\yxb\Downloads\汇总高危\总')
# eq(a)
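
# A minimal standalone sketch of the urlparse-based key extraction used in
# searchdata()/chooseInfo() above, run on made-up sample strings (not from the
# real data). It illustrates the three cases the code distinguishes: a full
# URL keeps its host in res.netloc; a schemeless "host/path" string lands in
# res.path; and on recent Python 3 a bare "host:port" string is parsed with
# the host as res.scheme, which is why the code falls back to res.scheme.
def _urlparse_demo():
    for sample in ('http://example.com/page', 'example.com/page', 'example.com:8080'):
        res = urlparse(sample)
        if res.scheme in ('http', 'https', ''):
            key = res.netloc if res.netloc else res.path   # host of a full URL, or the raw path
        else:
            key = res.scheme                               # "host" part of "host:port"
        print(sample, '->', key)

# _urlparse_demo()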