zoukankan      html  css  js  c++  java
  • 【url ---lib___】笔趣阁(抓取斗罗大陆完整)和(三寸天堂)

     1 # coding=gbk  #因为在黑屏下执行,所以代码会使用GBK
     2 url='http://www.biquge.info/10_10218/'
     3 UA={"User-Agent":"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"}
     4 UA1={"User-Agent":"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
     5     'Host':'www.xxbiquge.com',
     6     'Referer':'https://www.xxbiquge.com/2_2278/'}
     7 import time,lxml,pymysql
     8 from lxml import etree
     9 from urllib.request import Request
    10 from urllib.request import urlopen
    11 import os,sys,io
    12 sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding='gb18030') 
    13 
    def source(url):
        """Fetch ``url`` and return the raw response body as bytes.

        The request carries the module-level ``UA`` headers and uses a
        5-second timeout.  Errors raised by ``urlopen`` (timeouts, HTTP
        errors) are not caught here and propagate to the caller.
        """
        # The context manager closes the HTTP response (and its socket) even
        # when read() raises; previously the response object was left open.
        with urlopen(Request(url, None, UA), timeout=5) as response:
            return response.read()
    18 
    def respon(text):
        """Parse the index page markup and return the chapter link hrefs.

        ``text`` is handed to ``etree.HTML``; the result is the list of
        ``href`` attribute values of the ``<a>`` elements found under
        ``dl/dd`` inside the element whose id is ``list``.
        """
        chapter_href_xpath = "//*[@id='list']/dl/dd/a/@href"
        document = etree.HTML(text)
        return document.xpath(chapter_href_xpath)
    24 
    def spider(url):
        """Download every chapter listed in ``url`` and store it via ``mysqlw``.

        Args:
            url: iterable of chapter hrefs (the list returned by ``respon``).
                Each href is appended to ``https://www.xxbiquge.com`` to
                build the page address.

        For each chapter the text nodes of the ``#content`` element are
        concatenated, the chapter name is taken from the part of the page
        ``<title>`` before the first ``-``, and both are passed to ``mysqlw``
        together with the page address.  The function sleeps 3 seconds after
        every chapter, whether or not it could be stored.
        """
        for href in url:
            page_url = 'https://www.xxbiquge.com' + href
            try:
                # Close each response explicitly so sockets do not pile up
                # over a crawl of many chapters.
                with urlopen(Request(page_url, None, UA1), timeout=5) as response:
                    page = response.read()
            except OSError as exc:
                # URLError, HTTPError and socket timeouts are all OSError
                # subclasses; one bad chapter must not abort the whole crawl.
                print(page_url, exc)
            else:
                selector = etree.HTML(page)
                titles = selector.xpath('//html/head/title/text()')
                if titles:
                    # join() instead of repeated "+=" concatenation.
                    content = ''.join(selector.xpath('//*[@id="content"]/text()'))
                    chapter = titles[0].split('-')[0]
                    mysqlw(content, page_url, chapter)
                else:
                    # An unguarded [0] used to raise IndexError here.
                    print(page_url, 'missing <title>, chapter skipped')
            time.sleep(3)
    40 
    41 
    42 #c=os.path.join(os.path.abspath(os.path.dirname(__name__)),'2.html')
    43 #with open(c,'r') as f:
    44 #   a=f.read()
    45 
    def mysqlw(text, url, chapter):
        """Insert one chapter row into the ``douludalu`` table.

        Args:
            text: value stored in the ``souce`` column (``spider`` passes the
                chapter body here).
            url: value stored in the ``html`` column (the chapter page address).
            chapter: value stored in the ``chapter`` column (the chapter name).

        A failing INSERT is printed and rolled back instead of being raised.
        Connection errors are not caught.  The elapsed time is printed after
        the connection has been closed.
        """
        started = time.time()
        # host must be a keyword argument: PyMySQL 1.0+ no longer accepts
        # positional connect() arguments.
        # NOTE(review): credentials are hard-coded; consider moving them to
        # configuration.
        conn = pymysql.connect(host='localhost', port=3306, user='root',
                               passwd='liu', db='test', charset='utf8')
        try:
            cur = conn.cursor()
            print(url, chapter, 'w')
            # Placeholders let the driver escape the values. Interpolating the
            # chapter text into the statement broke on any quote character and
            # allowed SQL injection from scraped content.
            sql = "insert into douludalu(souce,html,chapter) values(%s,%s,%s)"
            params = (text, url, chapter)
            try:
                # mogrify() renders the statement exactly as it will be sent,
                # which keeps the original debug print of the final SQL.
                print(cur.mogrify(sql, params))
                cur.execute(sql, params)
                conn.commit()
                print("插入成功")
            except Exception as e:
                print(e)
                conn.rollback()
        finally:
            # Always release the connection, even if cursor creation fails.
            conn.close()
        print("关闭", '耗时', time.time() - started)
    65    
    def mysqlr(text):
        """Print the stored rows of ``douludalu`` whose ``html`` equals ``text``.

        Args:
            text: chapter page address to look up in the ``html`` column.

        For every matching row, columns 0 and 3 of the ``select *`` result are
        printed.
        NOTE(review): the previous query string could not be formatted at all
        (two ``%s`` placeholders, one argument), so an exact match on ``html``
        is assumed to be the intent; confirm.  Indexing column 3 assumes the
        table has at least four columns; verify against the schema.
        """
        conn = pymysql.connect(host='localhost', port=3306, user='root',
                               passwd='liu', db='test', charset='utf8')
        try:
            cur = conn.cursor()
            # Parameterized query: the driver quotes ``text`` safely.
            sql = 'select * from douludalu where html=%s'
            params = (text,)
            cur.execute(sql, params)
            print(cur.mogrify(sql, params))
            for row in cur.fetchall():
                # Distinct names: the old loop rebound the connection variable,
                # so the connection could never be closed.
                print(row[0], row[3])
        finally:
            conn.close()
    76    
    77 #a='2唐三已经挥出了八千余锤,铁坨不断的变小,已经不到最初时三分'
    78 #mysqlw(a,'1.html','第一章') 
    def main():
        """Crawl the book index page and store every chapter it links to."""
        index_markup = source('https://www.xxbiquge.com/2_2278/')
        # respon() extracts the chapter hrefs; spider() downloads and stores them.
        spider(respon(index_markup))
    83 #mysqlr('https://www.xxbiquge.com/2_2278/1036550.html')
    84 main()

     ——————————————————————————————————————————————————————————————————

     三寸天堂

     1 # coding=gbk
     2 url='http://www.biquge.info/10_10218/'
     3 UA={"User-Agent":"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"}
     4 UA1={"User-Agent":"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
     5     'Host':'www.biquge.com.tw',
     6     'Referer':'http://www.biquge.com.tw/14_14055/',
     7     'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'}
     8 import time,lxml,pymysql,threading
     9 from lxml import etree
    10 from urllib.request import Request
    11 from urllib.request import urlopen
    12 import os,sys,io
    13 sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding='gb18030') 
    14 
    def source(url):
        """Fetch ``url`` and return the raw response body as bytes.

        The request carries the module-level ``UA`` headers and uses a
        5-second timeout.  Errors raised by ``urlopen`` (timeouts, HTTP
        errors) are not caught here and propagate to the caller.
        """
        # The context manager closes the HTTP response (and its socket) even
        # when read() raises; previously the response object was left open.
        with urlopen(Request(url, None, UA), timeout=5) as response:
            return response.read()
    19 
    def respon(text):
        """Parse the index page markup and return the chapter link hrefs.

        ``text`` is handed to ``etree.HTML``; the result is the list of
        ``href`` attribute values of the ``<a>`` elements found under
        ``dl/dd`` inside the element whose id is ``list``.
        """
        chapter_href_xpath = "//*[@id='list']/dl/dd/a/@href"
        document = etree.HTML(text)
        return document.xpath(chapter_href_xpath)
    25 
    def spider(url):
        """Download one chapter page and store it via ``mysqlw``.

        Args:
            url: chapter href (one element of the list returned by
                ``respon``); it is appended to ``http://www.biquge.com.tw/``.

        The text nodes of the ``#content`` element are concatenated and stored
        together with the page address and the full page ``<title>``.  The
        function always sleeps 3 seconds before returning, whether or not the
        chapter could be stored.
        """
        page_url = 'http://www.biquge.com.tw/' + url
        print(page_url)
        try:
            # Close the response explicitly so sockets do not pile up while
            # the caller loops over all chapters.
            with urlopen(Request(page_url, None, UA1), timeout=5) as response:
                page = response.read()
        except OSError as exc:
            # URLError, HTTPError and socket timeouts are all OSError
            # subclasses; report and skip so the caller's loop can continue.
            print(page_url, exc)
            page = None
        # read() never returns None, so the old ``is None`` test was dead code
        # (and its fall-through would have hit undefined names).  Test for an
        # unusable (missing or empty) body instead.
        if page:
            selector = etree.HTML(page)
            titles = selector.xpath('//html/head/title/text()')
            if titles:
                # join() instead of repeated "+=" concatenation.
                content = ''.join(selector.xpath('//*[@id="content"]/text()'))
                chapter = titles[0]
                print(chapter)
                mysqlw(content, page_url, chapter)
            else:
                # An unguarded [0] used to raise IndexError here.
                print(page_url, 'missing <title>, chapter skipped')
        time.sleep(3)
    45 
    46 
    def mysqlw(text, url, chapter):
        """Insert one chapter row into the ``suibian`` table.

        Args:
            text: value stored in the ``souce`` column (``spider`` passes the
                chapter body here).
            url: value stored in the ``html`` column (the chapter page address).
            chapter: value stored in the ``chapter`` column (the page title).

        A failing INSERT is printed and rolled back instead of being raised.
        Connection errors are not caught.  The elapsed time is printed after
        the connection has been closed.
        """
        started = time.time()
        # host must be a keyword argument: PyMySQL 1.0+ no longer accepts
        # positional connect() arguments.
        # NOTE(review): credentials are hard-coded; consider moving them to
        # configuration.
        conn = pymysql.connect(host='localhost', port=3306, user='root',
                               passwd='liu', db='test', charset='utf8')
        try:
            cur = conn.cursor()
            print(url, chapter, '11111111111111111111111111111')
            # Placeholders let the driver escape the values. Interpolating the
            # chapter text into the statement broke on any quote character and
            # allowed SQL injection from scraped content.
            sql = "insert into suibian(souce,html,chapter) values(%s,%s,%s)"
            try:
                cur.execute(sql, (text, url, chapter))
                conn.commit()
                print("插入成功")
            except Exception as e:
                print(e)
                conn.rollback()
        finally:
            # Always release the connection, even if cursor creation fails.
            conn.close()
        print("关闭", '耗时', time.time() - started)
    64    
    def mysqlr(text):
        """Print the stored rows of ``douludalu`` whose ``html`` equals ``text``.

        Args:
            text: chapter page address to look up in the ``html`` column.

        Returns:
            ``False`` when the last matching row has ``None`` in column 3,
            otherwise ``True`` (including when no row matches).  The flag was
            previously computed but discarded; returning it is assumed to be
            the intent.  TODO confirm.

        NOTE(review): the previous query string could not be formatted at all
        (two ``%s`` placeholders, one argument), so an exact match on ``html``
        is assumed; confirm.  The ``mysqlw`` in this listing writes to
        ``suibian`` while this query reads ``douludalu``; confirm which
        table is intended.  Indexing column 3 assumes the table has at least
        four columns.
        """
        complete = True
        conn = pymysql.connect(host='localhost', port=3306, user='root',
                               passwd='liu', db='test', charset='utf8')
        try:
            cur = conn.cursor()
            # Parameterized query: the driver quotes ``text`` safely.
            sql = 'select * from douludalu where html=%s'
            params = (text,)
            cur.execute(sql, params)
            print(cur.mogrify(sql, params))
            last_row = None
            for row in cur.fetchall():
                # Distinct names: the old loop rebound the connection variable,
                # so the connection could never be closed.
                print(row[0], row[3])
                last_row = row
            # Guard the empty result: the loop variable used to be referenced
            # after the loop and was undefined when no row matched.
            if last_row is not None and last_row[3] is None:
                complete = False
        finally:
            conn.close()
        return complete
    78 
    def main():
        """Crawl the book index page, store each chapter and report timing.

        Prints the current thread name and the start timestamp, calls
        ``spider`` once per chapter href found on the index page, then prints
        the total elapsed time.
        """
        print(threading.current_thread().name)
        started = time.time()
        print('开始时间%s' % started)
        chapter_links = respon(source('http://www.biquge.com.tw/14_14055/'))
        for link in chapter_links:
            spider(link)
        elapsed = time.time() - started
        print('完成耗时%s' % elapsed)
    90   
    91 
    92 #c=os.path.join(os.path.abspath(os.path.dirname(__name__)),'1.html')
    93 #with open(c,'r') as f:
    94 #   a=f.read() 
    95 main()

    特别需要注意的是:在Request中传入UA(请求头)时容易出现错误,这时需要耐心地逐项排查,把问题解决。

    容易出现的错误:

      1,协议中,referer错误,host错误

      2,网页xpath错误,目测此网站的网页还是比较规则的

    不是所有的成功都是坐享其成
  • 相关阅读:
    基于u盘身份验证
    新的一年开始了~!
    asp.net的条形码
    windows phone (21) Grid元素的Background和Clip
    windows phone (19) 深入了解TextBlock
    windows phone (25) Canvas元素B
    windows phone (20) Image元素
    windows phone (22) 隐藏元素
    windows phone (26) ApplicationBar应用程序栏
    windows phone (27) 基础Button
  • 原文地址:https://www.cnblogs.com/Skyda/p/9179420.html
Copyright © 2011-2022 走看看