import ssl, re,json
from urllib.request import urlopen
# Disable HTTPS certificate verification.
# This rebinds the ssl module's private default-HTTPS-context factory to the
# factory that builds an unverified context.
# NOTE(review): with this in place, server certificates are presumably not
# validated for HTTPS requests made later in this process — confirm that this
# is intended, since it removes protection against man-in-the-middle attacks.
ssl._create_default_https_context = ssl._create_unverified_context
def getPage(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    Args:
        url: The URL to open; it is passed unchanged to ``urlopen``.

    Returns:
        The complete response body as a ``str``.

    Raises:
        UnicodeDecodeError: If the body is not valid UTF-8.
        Any exception raised by ``urlopen`` or by reading the response is
        propagated unchanged.
    """
    # The with-block closes the response (and its underlying connection) even
    # when read() or decode() raises, so repeated calls do not leak handles.
    with urlopen(url) as response:
        return response.read().decode("utf-8")
# Named groups of the listing pattern, in the order they appear in each record.
_LISTING_FIELDS = (
    "title",
    "xiaoqu",
    "huxing",
    "mianji",
    "chaoxiang",
    "zhuangxiu",
    "dianti",
    "flood",
    "floodtime",
    "diqu",
    "followInfo",
    "daikancishu",
    "subway",
    "fangben",
    "haskey",
    "totalPrice",
    "unitPrice",
)

# Matches one listing entry. re.S lets "." cross line breaks, so a single
# entry may span many lines of HTML.
_LISTING_PATTERN = re.compile(
    r'<!-- 热推标签、埋点 -->.*?data-is_focus="(?:1)?" data-sl="">(?P<title>.*?)</a>'
    r'.*?data-el="region">(?P<xiaoqu>.*?)</a>'
    r'.*?</span>(?P<huxing>.*?)<span'
    r'.*?/</span>(?P<mianji>.*?)<span'
    r'.*?/</span>(?P<chaoxiang>.*?)<span'
    r'.*?/</span>(?P<zhuangxiu>.*?)<'
    # "(...)?" makes the parenthesised part occur zero or one time; "(?:...)"
    # makes the outer group non-capturing, so only the named group inside it
    # is reported as a capture.
    r'(?:span class="divide">/</span>(?P<dianti>.*?)<)?'
    r'.*?div class="positionInfo">(?P<flood>.*?)<span'
    r'.*?/</span>(?P<floodtime>.*?)<span'
    r'.*?target="_blank">(?P<diqu>.*?)</a>'
    r'.*?class="followInfo">(?P<followInfo>.*?)<span'
    r'.*?/</span>(?P<daikancishu>.*?)<div class="tag">'
    # The next three tag spans are optional.
    r'(?:<span class="subway">(?P<subway>.*?)</span>)?'
    r'(?:<span class=".*?">(?P<fangben>.*?)</span>)?'
    r'(?:<span class="haskey">(?P<haskey>.*?)</span>)?'
    r'.*?<div class="totalPrice"><span>(?P<totalPrice>.*?)</div>'
    r'.*?data-price=".*?"><span>(?P<unitPrice>.*?)</span>',
    re.S,
)


def parsePage(d):
    """Lazily extract listing records from a page of HTML.

    Args:
        d: HTML text of one listing page.

    Yields:
        One dict per regex match, keyed by the pattern's named groups.
        Groups inside optional parts of the pattern ("dianti", "subway",
        "fangben", "haskey") are ``None`` when that part is absent. Any
        "</span>" text captured inside "totalPrice" is removed.
    """
    for match in _LISTING_PATTERN.finditer(d):
        record = {field: match.group(field) for field in _LISTING_FIELDS}
        # The totalPrice capture runs up to </div>, so it still contains the
        # closing </span> of the number; strip it.
        record["totalPrice"] = record["totalPrice"].replace("</span>", "")
        yield record
# Crawl listing pages 1-100 and append every parsed record to the output file
# as one JSON object per line.
# The with-block guarantees the file is closed even if a request or a parse
# fails part-way through the crawl.
with open("lianjia_Second-hand house_info", mode="a", encoding="utf-8") as f:
    for page in range(1, 101):
        # The first page is the bare listing URL; later pages add a "pgN" suffix.
        if page == 1:
            url = "https://bj.lianjia.com/ershoufang/"
        else:
            url = "https://bj.lianjia.com/ershoufang/" + "pg%s" % page
        print(url)
        for obj in parsePage(getPage(url)):
            # ensure_ascii=False keeps non-ASCII text readable in the output.
            data = json.dumps(obj, ensure_ascii=False)
            print(data)
            f.write(data + "\n")
            # Flush after each record so already-scraped data reaches the file
            # even if a later page aborts the run.
            f.flush()