zoukankan      html  css  js  c++  java
  • Python爬虫:爬取某网站关键词对应商品ID,且存入DB2数据库

    公司研发不给力,我就自己写了一个,专门爬关键词对应的商品ID。

    其中还学会了用Python操作DB2数据库、Python发送邮件、写日志文件、处理浏览器访问限制。


    #!/usr/bin/python
    # -*- encoding:utf-8 -*-


    import requests
    from lxml import etree
    import ibm_db
    import logging
    import sys
    import time
    import smtplib

    #配置写入日志
    logging.basicConfig(level=logging.INFO,
    format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
    datefmt='%a, %d %b %Y %H:%M:%S',
    filename='keywords_weekly.log',
    filemode='a')

    #编码
    reload(sys)
    sys.setdefaultencoding('utf-8')

    # 解决服务器限制访问问题
    def get_url_data(url,headers,max_tries=10):
    remaining_tries = max_tries
    while remaining_tries > 0:
    try:
    return requests.get(url,headers=headers)
    except requests.exceptions:
    time.sleep(60)
    remaining_tries = remaining_tries - 1
    raise Exception("Couldn't get the url_data.")



    #写入db2
    def write_db2(resultdict):
    rank=resultdict['rank']
    #由于中文编码问题,关键词直接用update的方法更新
    # keywords=resultdict['keywords']
    uv=resultdict['uv']
    frequency=resultdict['frequency']
    goods_id=resultdict['goods_id']
    sql_in="insert into T_KEYWORDS_weekly(K_RANK,UV,FREQUENCY,GOODS_ID,week_YEAR)"
    " values (%r,%r,%r,%r,year(current date)||'-'||WEEK_ISO(current date))" % (rank,uv,frequency,goods_id)
    ibm_db.exec_immediate(conn, sql_in)
    ibm_db.commit(conn)

    # #翻页
    def get_html(keywords):
    #keywords="沙发"
    user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML like Gecko) Chrome/31.0.1650.63 Safari/537.36'
    headers = { 'User-Agent' : user_agent }
      #网址被我隐藏了哈,可以猜猜
    url='http://www.XXX.com/category-9999/list-p1/?fl=q&keywords=%s'%keywords
    html = get_url_data(url,headers)
    html_list=[html]
    selector = etree.HTML(html.text)
    #page_f="http://www.meilele.com"
    #得到共有多少页,循环得到各个页面的url
    page_e=selector.xpath('/html/body/div[@class="page-panel"]/div/div/span[@class="p-info"]/b/text()')
    if page_e:
    for i in range(2,int(page_e[0])+1):
    url_temp='http://www.meilele.com/category-9999/list-p%d/?fl=q&keywords=%s'%(i,keywords)
    html_temp=requests.get(url_temp,headers=headers)
    html_list.append(html_temp)
    return html_list

    # #获取内容
    def get_id(dictionary):
    keywords=dictionary[1]
    html_list=get_html(keywords)
    logging.info("get the html_list %s successfully" %keywords)
    for each in html_list:
    html=each
    selector = etree.HTML(html.text)
    result={}
    try:
    content_field = selector.xpath('//*[@id="JS_list_panel"]/div[@class="w list-wrap"]/ul[@class="list-goods clearfix"]')[0]
    except:
    content_field=[]
    result['rank']=str(dictionary[0])
    # result['keywords']=str(keywords)
    result['uv']=str(dictionary[2])
    result['frequency']=str(dictionary[3])
    result['goods_id']=str('')
    write_db2(result)
    else:
    for i in range(1,len(content_field)+1):
    goods_id = content_field.xpath('li[%d]/@data-goods-id'%i)[0]
    #return goods_id
    result['rank']=str(dictionary[0])
    # result['keywords']=str(keywords)
    result['uv']=str(dictionary[2])
    result['frequency']=str(dictionary[3])
    result['goods_id']=str(goods_id)
    write_db2(result)




    if __name__ == "__main__":
      #把密码也隐藏起来
    conn=ibm_db.connect("DATABASE=aedw;HOSTNAME=miranda;PORT=50000;PROTOCOL=TCPIP;UID=miranda; PWD=miranda;", "", "")
    #测试连接
    try:
    conn
    logging.info("connect to DB2 successfully")
    except:
    logging.info("couldn't connect to DB2")
    # #创建表
    # sql_create='create table T_keywords_weekly_TEMP like V_keywords_weekly'
    # stmt_create = ibm_db.exec_immediate(conn, sql_create)
    # try:
    # stmt_create
    # logging.info("create table T_keywords_weekly_TEMP successfully")
    # except:
    # logging.info("couldn't create table T_keywords_weekly_TEMP")
    # #插入表
    # sql_insert="insert into T_keywords_weekly_TEMP select * from V_keywords_weekly where rank>=100"
    # stmt_insert = ibm_db.exec_immediate(conn, sql_insert)
    # try:
    # stmt_insert
    # logging.info("insert into T_keywords_weekly_TEMP successfully")
    # except:
    # logging.info("couldn't insert into table T_keywords_weekly_TEMP")


    sql_select="select * from T_keywords_weekly_TEMP where rank>=162"
    stmt_select = ibm_db.exec_immediate(conn, sql_select)
    try:
    stmt_select
    logging.info("get the data from T_keywords_weekly_TEMP")
    except:
    logging.info("couldn't get the data from T_keywords_weekly_TEMP")
    else:
    dictionary = ibm_db.fetch_both(stmt_select)
    while dictionary != False:
    logging.info ('rank:'+str(dictionary[0])+ ' keywords:'+str(dictionary[1]))
    get_id(dictionary)
    dictionary = ibm_db.fetch_both(stmt_select) #这一句不能少啊
    # 更新关键字
    sql_update='''
    MERGE INTO T_KEYWORDS_weekly as tkm
    USING T_keywords_weekly_TEMP as tkmt
    ON tkm.K_RANK=tkmt.RANK
    and tkm.week_YEAR=year(current date)||'-'||WEEK_ISO(current date)
    WHEN MATCHED
    THEN UPDATE SET tkm.KEYWORDS=tkmt.KEYWORDS
    ELSE IGNORE
    '''
    stmt_update=ibm_db.exec_immediate(conn, sql_update)
    try:
    stmt_update
    logging.info("update the keywords")
    except:
    logging.info("couldn't update the keywords")

    # sql_drop="drop table T_keywords_weekly_TEMP"
    # stmt_drop = ibm_db.exec_immediate(conn, sql_drop)
    # try:
    # stmt_drop
    # logging.info("drop table T_keywords_weekly_TEMP successfully")
    # except:
    # logging.info("couldn't drop table T_keywords_weekly_TEMP")
    #
    # ibm_db.close(conn)
    # logging.info("close the connect!")

    # 配置发送邮件
    sender = 'tangxin2@meilele.com'
    receivers = ['tangxin2@meilele.com']
    SUBJECT = "Successfully update T_KEYWORDS_weekly"
    TEXT = '''
    Dear miranda,
    your python script of update T_KEYWORDS_weekly of last month have successed.
    may you have a good mood
    '''

    message = """
    From: %s
    To: %s
    Subject: %s

    %s
    """ % (sender, ", ".join(receivers), SUBJECT, TEXT)

    try:
    smtpObj = smtplib.SMTP('mail.meilele.com', 25)
    smtpObj.sendmail(sender, receivers, message)
    logging.info("Successfully sent email")
    except:
    logging.info("Error: unable to send email" )

    print('finish')
    
    
  • 相关阅读:
    zabbix 设备(自己的实践)
    10.24的注意事项——解决linux_jni编译错误的问题
    [Angular] Adding keyboard events to our control value accessor component
    [tmux] Copy and paste text from a tmux session
    [tmux] Customize tmux with tmux.conf
    [tmux] Zoom and resize to view a particular pane within tmux
    [tmux] Manage terminal workspaces using session naming
    [tmux] Reuse terminal workspaces using tmux sessions
    [tmux] Create collections of panes using tmux windows
    [tmux] Organize your terminal using tmux panes
  • 原文地址:https://www.cnblogs.com/miranda-tang/p/5508390.html
Copyright © 2011-2022 走看看