zoukankan      html  css  js  c++  java
  • 爬小说完本排行

    # Practice script: crawl only the first 2 pages of the ranking.
    import logging
    import requests
    import bs4
    import json
    import psycopg2
    from io import StringIO
    from urllib import parse
    # from proxy_ip import proxyip

    # Root logger: INFO and above, each record prefixed with timestamp and level.
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s- %(message)s',
        level=logging.INFO,
    )

    # Module-level accumulator; save_to_txt() appends one space-joined record per book row.
    blist = list()

    def main(page):
        """Fetch one page of the ranking and collect its book rows.

        Args:
            page: Page number appended to the ranking URL.

        Returns:
            None. Parsed rows are accumulated by ``save_to_txt``.
        """
        url = 'https://www.92qb.com/book/allvote/0/' + str(page)
        logging.info(url)
        html = request_douban(url)
        if html is None:
            # request_douban() yields None on a request error or a non-200
            # reply; BeautifulSoup cannot parse None, so skip this page
            # instead of crashing the whole run.
            logging.warning('no html received for %s, skipping page', url)
            return
        soup = bs4.BeautifulSoup(html, 'lxml')
        save_to_txt(soup)

    def request_douban(url, timeout=10):
        """Download *url* and return its body decoded as GBK.

        Args:
            url: URL to fetch.
            timeout: Seconds to wait for the server before giving up. Without
                a timeout ``requests.get`` can block indefinitely.

        Returns:
            The response text, or None when the request raises
            ``requests.RequestException`` or the status code is not 200.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/88.0.4324.146 Safari/537.36',
        }

        # NOTE(review): this proxy is only printed and logged; it is NOT passed
        # to requests.get() below, so requests go out directly. Confirm whether
        # the proxy was disabled on purpose before wiring it in.
        proxyooo = {'https': '182.84.144.12:3256'}
        print(proxyooo)
        logging.info('proxyip ' + str(proxyooo))

        try:
            target_response = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException as exc:
            logging.warning('request to %s failed: %s', url, exc)
            return None

        if target_response.status_code != 200:
            logging.warning('unexpected status %s for %s', target_response.status_code, url)
            return None

        # Decode the body as GBK rather than the encoding requests guessed.
        target_response.encoding = 'gbk'
        return target_response.text

    def save_to_txt(soup):
        """Extract the book rows of a parsed ranking page into ``blist``.

        Args:
            soup: Parsed page. The rows are the ``<ul>`` elements inside the
                element with class ``"clearfix rec_rullist"``.

        Each row is appended to the module-level ``blist`` as one space-joined
        string built from, in order: the text of the cells with class ``two``,
        ``four`` and ``sev``, the ``href`` of the link inside ``two``, and the
        text of the cells with class ``five`` and ``six``.

        Rows that lack any of those cells are skipped; a page without the list
        container is logged and ignored.

        Returns:
            None.
        """
        container = soup.find(class_="clearfix rec_rullist")
        if container is None:
            logging.warning('ranking list container not found in page')
            return None

        booklist = container.find_all("ul")
        logging.debug('booklist ' + str(booklist))
        for item in booklist:
            try:
                shuming = item.find(class_="two").string
                zuozhe = item.find(class_="four").string
                leixing = item.find(class_="sev").string
                zhishu = item.find(class_="five").string
                wanchengriqi = item.find(class_="six").string
                dizhi = item.find(class_="two").find('a', href=True)['href']
            except (AttributeError, TypeError, KeyError):
                # find() returned None for one of the cells (e.g. a header or
                # otherwise malformed row). Skip just this row; returning here
                # would silently drop every remaining row on the page.
                logging.debug('skipping row without the expected cells: %s', item)
                continue
            tap = (shuming, zuozhe, leixing, dizhi, zhishu, wanchengriqi)
            blist.append(' '.join('%s' % field for field in tap))

        # Log the accumulated list once per page instead of once per row.
        logging.info(blist)
        return None

    def writedatabase(blist):
        """Bulk-load the collected records into PostgreSQL via COPY.

        Args:
            blist: Iterable of record strings; they are concatenated, each
                followed by a single space, into the COPY payload.

        Connection settings and the target table name are read from
        ``config/kafka_and_postgres_config.json``.

        Raises:
            Whatever ``open``, ``json.load`` or ``psycopg2`` raise; the
            connection is closed in every case.
        """
        with open("config/kafka_and_postgres_config.json", encoding="utf-8") as json_file:
            Conf = json.load(json_file)
        pghost = Conf['postgres_host']
        pguser = Conf['postgres_user']
        pgpassword = Conf['postgres_password']
        pgdatabase = Conf['postgres_database']
        pgtable = Conf['postgres_table']

        s = ''.join(action + ' ' for action in blist)
        logging.info(s)

        # NOTE(review): copy_from() defaults to tab-separated columns and one
        # row per line, but the payload above is a single space-separated line.
        # The column list below names five columns (including 'zhuangtai'),
        # while save_to_txt() builds six fields per record in a different
        # order. Confirm the table schema and fix the payload format before
        # enabling this function.
        conn = psycopg2.connect(host=pghost, user=pguser, password=pgpassword, database=pgdatabase)
        try:
            # The cursor context manager closes the cursor even if COPY fails.
            with conn.cursor() as cur:
                cur.copy_from(StringIO(s), pgtable, columns=('shuming', 'zuozhe', 'leixing', 'zhuangtai', 'dizhi'))
            conn.commit()
        finally:
            # Always release the connection, including on error.
            conn.close()
        print('完成')


    if __name__ == "__main__":
        # Script entry point: crawl ranking pages 1 and 2 in order.
        for page_no in (1, 2):
            main(page_no)
  • 相关阅读:
    nodejs async waterfull 小白向
    nodejs async series 小白向
    MySQL 分区介绍总结
    cocos2d-x 一些实用的函数
    LeetCode(61)-Valid Palindrome
    ganglia错误解决
    (6)uboot具体解释——关闭缓存和mmu
    Linux下设置MySQL不区分大写和小写
    火狐与IE的7个JavaScript差异
    商业研究(8):汽车交通
  • 原文地址:https://www.cnblogs.com/fanpiao/p/15305364.html
Copyright © 2011-2022 走看看