zoukankan      html  css  js  c++  java
  • python爬虫 赶集网

    #coding=utf-8
    import requests
    from lxml import etree
    from sqlalchemy import create_engine
    from sqlalchemy.ext.declarative import declarative_base
    from sqlalchemy import Column, String, Integer
    from sqlalchemy.orm import sessionmaker

    def requests_view(response):
    import webbrowser
    requests_url = response.url
    base_url = '<head><base href="%s">' %(requests_url)
    base_url = base_url.encode('utf-8')
    content = response.content.replace(b"<head>",base_url)
    tem_html = open('tmp.html','wb')
    tem_html.write(content)
    tem_html.close()
    webbrowser.open_new_tab("tmp.html")

    host = "http://sz.ganji.com/fang1/o{}"
    max = 10

    engine = create_engine('mysql+mysqldb://root:root@192.168.33.30:3306/python?charset=utf8',echo=True,encoding='utf8')
    Base = declarative_base()

    class Ganji(Base):

    __tablename__ = 'ganji'

    id = Column(Integer, primary_key=True)
    title = Column(String(100))
    money = Column(String(100))
    info = Column(String(100))
    create_time = Column(String(30))


    def __repr__(self):
    return '%s(%r)' % (self.__class__.__name__, self.username)
    # Base.metadata.create_all(engine)
    # exit()
    def save_data(title,money,info):
    # 创建session对象:
    DBSession = sessionmaker(bind=engine)
    session = DBSession()
    # 创建新User对象:
    import datetime
    create_time = datetime.datetime.now()
    new_ganji = Ganji( title=title,money=money,info=info,create_time="test")
    # 添加到session:
    session.add(new_ganji)
    # 提交即保存到数据库:
    session.commit()
    # 关闭session:
    session.close()

    def get_html(url):
    headers = {'Referer':'http://callback.ganji.com/firewall/valid/1902788594.do?namespace=ganji_zufang_list_pc&url=http%3A%2F%2Fsz.ganji.com%2Ffang1%2F','User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36'}
    response = requests.get(url,headers=headers)
    if response.status_code == 200:
    #requests_view(response)
    #strip
    html = etree.HTML(response.content.decode('utf-8'))
    items = html.xpath(".//div[@class='f-main-list']/div/div")
    print(len(items))
    for i in items:
    title = i.xpath(".//dd[@class='dd-item title']/a/text()")
    money = i.xpath(".//dd[@class='dd-item info']/div[@class='price']/span/text()")
    info = i.xpath(".//dd[@class='dd-item size']/span/text()")
    print(info)
    title = ' '.join(title)
    money = ' '.join(money)
    info = ' '.join(info)
    if len(title) > 0 and len(money) >0 and len(info) > 0 :
    save_data(title,money,info)
    else:
    print("未获取到数据");

    else:
    print("请求失败")
    try:
    for i in range(1,max):
    url = host.format(i)
    print(url)
    get_html(url)
    except Exception as e:
    print(str(e))


  • 相关阅读:
    0421 & SX2016
    HDU3948 & 回文树模板
    BZOJ 2152 & 点分治
    HDU5618 & CDQ分治
    CC countari & 分块+FFT
    ECF R9(632E) & FFT
    ECF R9(632E) & DP
    BZOJ的两道osu概率DP easy与osu
    BZOJ3197 & 组合乱搞
    转载 Rational Rose 的配置和破解
  • 原文地址:https://www.cnblogs.com/brady-wang/p/8931219.html
Copyright © 2011-2022 走看看