zoukankan html css js c++ java

python爬虫赶集网

#coding=utf-8
import requests
from lxml import etree
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, String, Integer
from sqlalchemy.orm import sessionmaker

def requests_view(response):
    import webbrowser
    requests_url = response.url
    base_url = '<head><base href="%s">' %(requests_url)
    base_url = base_url.encode('utf-8')
    content = response.content.replace(b"<head>",base_url)
    tem_html = open('tmp.html','wb')
    tem_html.write(content)
    tem_html.close()
    webbrowser.open_new_tab("tmp.html")

host  = "http://sz.ganji.com/fang1/o{}"
max = 10

engine = create_engine('mysql+mysqldb://root:root@192.168.33.30:3306/python?charset=utf8',echo=True,encoding='utf8')
Base = declarative_base()

class Ganji(Base):

    __tablename__ = 'ganji'

    id = Column(Integer, primary_key=True)
    title = Column(String(100))
    money = Column(String(100))
    info = Column(String(100))
    create_time  = Column(String(30))


    def __repr__(self):
        return '%s(%r)' % (self.__class__.__name__, self.username)
# Base.metadata.create_all(engine)
# exit()
def save_data(title,money,info):
    # 创建session对象:
    DBSession = sessionmaker(bind=engine)
    session = DBSession()
    # 创建新User对象:
    import datetime
    create_time = datetime.datetime.now()
    new_ganji = Ganji( title=title,money=money,info=info,create_time="test")
    # 添加到session:
    session.add(new_ganji)
    # 提交即保存到数据库:
    session.commit()
    # 关闭session:
    session.close()

def get_html(url):
    headers = {'Referer':'http://callback.ganji.com/firewall/valid/1902788594.do?namespace=ganji_zufang_list_pc&url=http%3A%2F%2Fsz.ganji.com%2Ffang1%2F','User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36'}
    response = requests.get(url,headers=headers)
    if response.status_code == 200:
        #requests_view(response)
        #strip
        html = etree.HTML(response.content.decode('utf-8'))
        items = html.xpath(".//div[@class='f-main-list']/div/div")
        print(len(items))
        for i in items:
            title = i.xpath(".//dd[@class='dd-item title']/a/text()")
            money = i.xpath(".//dd[@class='dd-item info']/div[@class='price']/span/text()")
            info = i.xpath(".//dd[@class='dd-item size']/span/text()")
            print(info)
            title = ' '.join(title)
            money = ' '.join(money)
            info = ' '.join(info)
            if len(title) > 0 and len(money) >0 and len(info) > 0 :
                save_data(title,money,info)
            else:
                print("未获取到数据");

    else:
        print("请求失败")
try:
    for i in range(1,max):
        url = host.format(i)
        print(url)
        get_html(url)
except Exception as e:
    print(str(e))

查看全文

相关阅读:
Linux Vim编辑器
 Linux sed 流编辑器
 Shell 编程（变量和条件测试）
Linux 下 Bash配置文件读取
 Linux 用户、权限
 Linux 指令（一）文件/目录操作
 Ubuntu 下安装 Swoole
Mysql IN语句查询
 Mysql 查询优化
 Mysql 获取表属性

原文地址：https://www.cnblogs.com/brady-wang/p/8931219.html

python爬虫 赶集网

python爬虫赶集网