#!/usr/bin/env python3
# -*- coding:utf-8 -*-
import urllib.request
import re
import sys
import time
class UserName:
    """Scrape user names from a web site and append them to a log file.

    The index page at ``url`` is searched for section links; for every
    section a fixed range of ``list_<n>_<i>.html`` pages is fetched and the
    text of each ``<p>...</p>`` element is appended to ``path``, one entry
    per line.
    """

    # Numeric ranges substituted into "list_<n>_<i>.html".
    # NOTE(review): presumably <n> is a list/type id and <i> a page number;
    # confirm against the site's URL scheme.
    TYPE_IDS = range(60, 92)
    PAGE_IDS = range(261124, 280000)

    # Pattern whose single group captures one name on a list page.
    NAME_REGEX = r'<p>(.*?)</p>'

    def __init__(self, url="http://www.wangmingdaquan.cc/",
                 path=r"C:\Users\Administrator\Desktop\username.log"):
        """Store the site root and the output file location.

        Args:
            url: Root URL of the site; must end with ``/`` because section
                and page paths are appended to it directly.
            path: File that collected names are appended to. A raw string
                is used for the default so the backslashes of the Windows
                path are not parsed as escape sequences.
        """
        self.url = url
        self.path = path

    def loop(self):
        """Discover the site's sections and scrape every candidate list page.

        Returns:
            The total number of names matched across all fetched pages.
        """
        index_html = self.getPage(self.url)
        # Section links are absolute URLs under the site root; the group
        # captures the section path without its trailing slash.
        link_regex = '<a href="' + re.escape(self.url) + '(.*?)/">.*?</a>'
        page_list = self.comp(index_html, link_regex, 1)
        print(page_list)

        count = 0
        for section in page_list:
            for n in self.TYPE_IDS:
                for i in self.PAGE_IDS:
                    # The captured section has no trailing slash, so one is
                    # added before the page file name.
                    url = "{}{}/list_{}_{}.html".format(self.url, section, n, i)
                    print("now, we are scraping '" + url + "'")
                    html = self.getPage(url)
                    if not html:
                        # Fetch failed or the page was empty; try the next one.
                        continue
                    # Kept in its own variable so the loop index ``n`` used
                    # to build later URLs is not overwritten.
                    found = self.comp(html, self.NAME_REGEX)
                    print("总数为: " + str(found))
                    count += found
        print("catch name over, total number: " + str(count))
        return count

    def getPage(self, url):
        """Fetch ``url`` and return its body as text.

        The body is decoded with the charset declared in the response
        headers, falling back to UTF-8 when none is declared.

        Returns:
            The decoded page, or ``''`` if the request or decoding failed
            (the error is printed rather than raised).
        """
        try:
            with urllib.request.urlopen(url) as resp:
                charset = resp.headers.get_content_charset() or "utf-8"
                return resp.read().decode(charset)  # bytes -> str
        except Exception as e:
            # Deliberately broad: one bad page (network error, malformed
            # URL, undecodable body, unknown charset) must not abort the
            # whole run.
            print("getPage => " + str(e))
            return ''

    def comp(self, html, regex, status=0):
        """Find every match of ``regex`` in ``html``.

        Args:
            html: Text to search.
            regex: Pattern; compiled with ``re.M``.
            status: When truthy, return the matches instead of saving them.

        Returns:
            The list of matches if ``status`` is truthy; otherwise the
            number of matches after they have been passed to ``save``.
        """
        matches = re.compile(regex, re.M).findall(html)
        if status:
            return matches
        return self.save(matches)

    def save(self, name_list):
        """Append every name in ``name_list`` to the log file.

        Returns:
            ``len(name_list)``.
        """
        for name in name_list:
            self.write_config(name)
        return len(name_list)

    def write_config(self, data):
        """Append ``data`` (stripped of surrounding whitespace) as one line.

        The file is written as UTF-8. Failures to open or write the file
        are printed and otherwise ignored.
        """
        try:
            with open(self.path, 'a', encoding='utf-8') as f:
                f.write(data.strip() + "\n")
        except (OSError, UnicodeError) as e:
            print("write_config => " + str(e))
if __name__ == '__main__':
    # Script entry point: build a scraper with its defaults and run it.
    UserName().loop()