zoukankan      html  css  js  c++  java
  • 交作业啊,python爬取58的页面

    第一次写博文,好紧张啊,写这么烂怎么给别人看啊
    先做下总结:
      刚开始学习python,自我感觉python写起来确实很方便,各种库,各种语法糖,不过刚接触,一下子记不下来这么多东西,总感觉乱乱的,用的多了熟悉之后应该就好了吧
      这次作业基本完成了作业的要求,但是由于是在上班期间抽时间写的,可能有些乱。个人感觉这次作业的难度不大,唯一麻烦的地方就是找浏览量接口,
      一开始我以为58会把浏览量的值直接发过来,通过选择器直接去修改页面的值,于是试了各种选择方式去js代码里搜,结果搜了半天也没找到,最后只好通过查看哪次网络请求之后浏览量就会出现的方式
      来定位出获取浏览量的接口。总结完了,后面是全部代码:


    import re
    import time

    import requests
    from bs4 import BeautifulSoup

    # The ``collections.Iterator`` alias was removed in Python 3.10; prefer
    # ``collections.abc`` and fall back only on interpreters that lack it.
    try:
        from collections.abc import Iterator
    except ImportError:
        from collections import Iterator
    
    # Request headers sent to the view-counter endpoint (jst1.58.com).
    # Header names and the User-Agent / Accept values must not contain stray
    # spaces: a name such as 'User - Agent' is not the User-Agent header.
    # NOTE(review): the Cookie and Referer below are values captured from one
    # browser session and are hard-coded; presumably they expire -- confirm
    # whether the counter endpoint still accepts them.
    header = {
        'Host': 'jst1.58.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0',
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Accept-Encoding': 'gzip, deflate',
        'Referer': 'http://bj.58.com/pingbandiannao/24063857671738x.shtml?adtype=1&entinfo=24063857671738_0&psid=157472432191615221843458468',
        'Cookie': 'id58=vDg6HFci7MbdAHOifdgN2Q==; utm_source=market; spm=b-31580022738699-me-f-824.bdpz_biaoti; mcity=bj; city=bj; 58home=bj; 58tj_uuid=e0c574ac-c792-4b29-a6b6-0add83b27579; new_session=0; new_uv=1; init_refer=http%253A%252F%252Fbzclk.baidu.com%252Fadrc.php%253Ft%253D0fKL00c00f7ngK60jUPi0nW_R0jDeaFg00000r7J01300000XL2vy9.THYdr0K85yF9pywdpAqVuNqsusK15y7BujD1mycknj0snWmzuHf0IHYvPHDYfWf4nDD4nYDYwRf1fW7DfWFjwjb4nWu7wRmkf6K95gTqFhdWpyfqnWm4rHc1nHT3niusThqbpyfqnHmhULFG5HDhTLNBULFG5iusThbqn6K-5y9YIZ0lQzqJIydsQhkdUhD8PH68mvqVQLnOTLKV5ycVn1Ddrj0snWcLn16vrHbVUyRVuBY3nWfhmv6qmhwsX-qBpy7EIAb0mLFW5HRYn1bL%2526tpl%253Dtpl_10085_12986_1%2526l%253D1040411361%2526wd%253D58%2526issp%253D1%2526f%253D8%2526ie%253Dutf-8%2526tn%253Dbaiduhome_pg%2526inputT%253D850; als=0; ipcity=bj%7C%u5317%u4EAC%7C0; myfeet_tooltip=end; bj58_new_session=0; bj58_init_refer=""; bj58_new_uv=1; bj58_id58s="UC1sYXcxM1I3ajhtMTY5OQ=="; sessionid=d8c0d2b1-ea07-4ace-b038-1b367908784c; final_history=24063857671738%2C25843657514315',
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
    }
    def _getText(tag):
        """Return ``tag.get_text()``, or None when no tag was found."""
        return None if tag is None else tag.get_text()


    def getDetail(url):
        """Scrape one listing detail page together with its view count.

        Sleeps two seconds, downloads ``url``, and separately queries the
        counter endpoint on jst1.58.com (using the module-level ``header``)
        for the listing id taken from the URL.

        Args:
            url: Detail page URL; the id is the text between the last ``/``
                and ``x.shtml``.

        Returns:
            A dict with the keys ``title``, ``category``, ``time``, ``price``,
            ``chengse``, ``area`` and ``count``. A field whose selector
            matches nothing is None (``area`` is an empty string) instead of
            raising AttributeError.
        """
        # NOTE(review): meant to raise the retry count for the requests
        # below -- confirm this module-level override actually takes effect.
        requests.adapters.DEFAULT_RETRIES = 5
        # When the pattern does not match, re.sub returns the URL unchanged.
        infoid = re.sub(r'^.*/(.*)x\.shtml.*$', lambda m: m.group(1), url)

        # On an unreliable network the author's alternative was to wrap the
        # sleep and the two requests below in a ``while True`` / ``try`` loop
        # that retries after any error.
        time.sleep(2)
        response = requests.get(url)
        counterResponse = requests.get(
            'http://jst1.58.com/counter?infoid={}'.format(infoid),
            headers=header)

        html = BeautifulSoup(response.text, 'lxml')
        # Keep only what follows "Counter58.total=" in the counter response.
        count = re.sub(r'^.*Counter58.total=(.*)$',
                       lambda m: m.group(1), counterResponse.text)

        area = getFirst(html.select(
            '#content div.col_sub.sumary ul li:nth-of-type(3) div.su_con span'))
        chengse = getFirst(html.select(
            '#content div.col_sub.sumary ul li:nth-of-type(2) div.su_con span'))

        return {
            'title': _getText(getFirst(html.select(
                '#content div.col_sub.mainTitle > h1'))),
            # Alternative selector noted by the author: span.crb_i:nth-child(3)
            'category': _getText(getFirst(html.select(
                'span.crb_i a:nth-of-type(1)'))),
            'time': _getText(getFirst(html.select('#index_show li.time'))),
            'price': _getText(getFirst(html.select(
                '#content div.col_sub.sumary li:nth-of-type(1) div.su_con span'))),
            # First non-blank string inside the second summary row.
            'chengse': None if chengse is None
            else getFirst(chengse.stripped_strings),
            'area': '' if area is None else ''.join(area.stripped_strings),
            'count': count,
        }


    def getFirst(obj):
        """Return the first value of a list or iterator, or None.

        Args:
            obj: A list (or list subclass), an iterator, or anything else.

        Returns:
            ``obj[0]`` for a non-empty list, the next item of an iterator, and
            None for an empty list, an exhausted iterator, or any other type
            (including None, str and tuple).
        """
        if isinstance(obj, list):
            return obj[0] if obj else None
        if isinstance(obj, Iterator):
            # A default avoids StopIteration on an exhausted iterator.
            return next(obj, None)
        return None


    def getListPage():
        """Walk the listing pages and print the details of every listing.

        Requests pages 1 to 199 of ``http://bj.58.com/pbdn/0/pn{}/`` and stops
        at the first page without any listing rows. Rows whose first CSS class
        is ``zzinfo`` and rows without a detail link are skipped.
        """
        for page in range(1, 200):
            response = requests.get('http://bj.58.com/pbdn/0/pn{}/'.format(page))
            html = BeautifulSoup(response.text, 'lxml')
            rows = html.select('#infolist > table.tbimg > tr')
            # No rows means the page number is past the last page.
            if not rows:
                break
            for row in rows:
                if getFirst(row.get('class')) == 'zzinfo':
                    continue
                link = getFirst(row.select('.img > a'))
                href = None if link is None else link.get('href')
                if not href:
                    continue
                # Print the scraped result.
                print(getDetail(href))


    # Start the crawl only when run as a script, not on import.
    if __name__ == '__main__':
        getListPage()
    
    
  • 相关阅读:
    淘宝首页广告圆角切换标签未解之谜(vml)
    chrome的google官方在线安装太坑爹了,找到一个离线下载地址
    kejun写的响应性设计和开发
    HTTP状态码
    xwebkitspeech 语音输入功能
    Avoid Redirects 避免重定向
    webstorm下使用github
    开通了github,用webstorm上传,敲命令行太累。
    jQuery1.6.1下event鼠标事件有BUG,升级到1.7.1可以解决问题。
    从程序员到项目经理(五):不是人人都懂的学习要点
  • 原文地址:https://www.cnblogs.com/wsss/p/5464623.html
Copyright © 2011-2022 走看看