  • Python crawler: scraping Dianping (大众点评) and importing the results into Redis

    Here is the code. The Chinese-encoding issue when importing into Redis is not solved yet; I'll post updated code as soon as it is fixed (one possible workaround is sketched after the listing below). I'm still a beginner, so please bear with me!

    # -*- coding: utf-8 -*-
    import re
    import requests
    from time import sleep, ctime
    from urllib.request import urlopen
    from urllib.request import Request
    from lxml import etree
    import redis
    import MySQLdb  # imported but not actually used in this script
    
    
    r = redis.Redis(host='192.168.60.112', port=6379, db=0)  # host: your own Redis server's IP address
    
    # Add a User-Agent header to mimic a real browser
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    for page in range(1, 3):  # scrape pages 1 and 2 (range's end is exclusive)
        # Dianping search URL, built by string formatting and concatenation
        url = "http://www.dianping.com/search/category/2/10/g112p%i" % page + "?aid=90308842%2C21171398%2C22974252%2C77259356%2C79709316%2C69011566%2C93070619%2C75101541%2C5724122%2C21559834&cpt=90308842%2C21171398%2C22974252%2C77259356%2C79709316%2C69011566%2C93070619%2C75101541%2C5724122%2C21559834&tc=1"
        # print(url)
        req_timeout = 5  # request timeout in seconds
        req = Request(url=url, headers=headers)
        f = urlopen(req, None, req_timeout)
        s = f.read()
        s = s.decode('utf-8')
        ss = str(s)
        # Parse the HTML with lxml
        selector = etree.HTML(ss)
        # XPath: extract each shop's link and its name (the <h4> text)
        links = selector.xpath(
            '//div[@class="txt"]/div[@class="tit"]/a/@href|//div[@class="txt"]/div[@class="tit"]/a/h4/text()')
        for link in links:
            print(link)
            # Write into Redis using a list (LPUSH, stack-like behaviour)
            r.lpush('mylist', link)
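
    On the encoding issue mentioned above: redis-py stores whatever bytes it receives and returns raw bytes on reads, so Chinese text often just looks garbled until it is decoded. Below is a minimal sketch of one way to read the data back as readable UTF-8, assuming the same Redis server and the same `mylist` key as in the listing; `decode_responses=True` is a standard redis-py option, not something from the original script, so treat this as an untested suggestion rather than the fix.

    # -*- coding: utf-8 -*-
    import redis

    # decode_responses=True makes redis-py return str instead of raw bytes,
    # so UTF-8 Chinese text prints correctly. Host/port assumed same as above.
    r = redis.Redis(host='192.168.60.112', port=6379, db=0, decode_responses=True)

    # Read back everything pushed onto the 'mylist' key and print it.
    for item in r.lrange('mylist', 0, -1):
        print(item)  # shop URLs and names as readable strings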
  • Original post: https://www.cnblogs.com/wangyuhangboke/p/7857528.html