zoukankan      html  css  js  c++  java
  • python多线程抓取网页信息

    #!/usr/bin/env python
    #-*- coding: utf-8  -*-
    import urllib 
    import urllib2 
    import random 
    import requests
    import os,sys 
    import Queue
    import threading
    import time
    import MySQLdb
    from sgmllib import SGMLParser 
    import re
    # Running count of saved pictures; its current value names the next file
    # written under picture/.
    num = 0
    # URLs of pages still to be downloaded (consumed by ThreadUrl workers).
    queue = Queue.Queue()
    # Downloaded page bodies still to be parsed (consumed by DatamineThread
    # workers).
    out_queue = Queue.Queue()




    class ThreadUrl(threading.Thread):
        """Worker thread that downloads the pages whose URLs arrive on ``queue``.

        Each successfully downloaded page body is put on ``out_queue``.  The
        thread loops forever, so it is meant to be started as a daemon.
        """

        def __init__(self, queue, out_queue):
            """Remember the input queue of URLs and the output queue of bodies."""
            threading.Thread.__init__(self)
            self.queue = queue
            self.out_queue = out_queue

        def run(self):
            """Fetch URLs from ``self.queue`` until the process exits.

            A failed download is reported, followed by a 5 second pause, and
            the URL is dropped (it is not retried).  ``task_done()`` is called
            for every URL taken, whether or not the download succeeded, so
            that ``queue.join()`` in the producer can never hang on a failure.
            """
            while True:
                host = self.queue.get()
                print(host)
                try:
                    response = requests.get(host)
                    try:
                        result = response.content
                    finally:
                        # Release the connection even if reading the body fails.
                        response.close()
                    # Place the page body into the out queue.
                    self.out_queue.put(result)
                except Exception as exc:
                    # Worker boundary: report the failure and back off instead
                    # of letting one bad URL kill the thread.
                    print("fetch failed for %s: %s" % (host, exc))
                    time.sleep(5)
                finally:
                    # Signal to the queue that this job is done.
                    self.queue.task_done()


    class DatamineThread(threading.Thread):
        """Worker thread that parses downloaded pages taken from ``out_queue``.

        For every page it extracts the fields stored in the ``address`` table
        (name, version, developer, pubtime, filesize, support, introduction),
        inserts one row per introduction block found, and downloads the
        image(s) referenced in the page's ``appTitle`` block into ``picture/``.
        The thread loops forever, so it is meant to be started as a daemon.
        """

        # Guards the module-level ``num`` counter shared by all workers.
        _num_lock = threading.Lock()

        # Patterns are compiled once instead of once per page.  ``[\s\S]*?``
        # lazily spans any characters including newlines.
        _FIELD_RES = (
            re.compile(r'<div class="appdiscrib">[\s\S]*?<h4>(.+?)</h4>'),
            re.compile('版 本 号(.+?)</li>'),
            re.compile('开 发 者(.+?)</li>'),
            re.compile('发布时间(.+?)</li>'),
            re.compile('文件大小(.+?)</li>'),
            re.compile('支持固件(.+?)</li>'),
        )
        _INTRO_RE = re.compile(
            r'应用介绍</h3>[\s\S]*?<div class="intro">([\s\S]*?)</div>')
        _ICON_RE = re.compile(
            r' <div class="appTitle clearfix">[\s\S]*?<img src=(.+?)/>')

        _INSERT_SQL = "insert into address(name,version,developer,pubtime,filesize,support,introduction) values(%s,%s,%s,%s,%s,%s,%s)"

        def __init__(self, out_queue):
            """Remember the queue of downloaded page bodies to consume."""
            threading.Thread.__init__(self)
            self.out_queue = out_queue

        def run(self):
            """Process pages from ``self.out_queue`` until the process exits.

            ``task_done()`` is called for every page taken, even when parsing
            or storing fails, so ``out_queue.join()`` cannot hang.
            """
            while True:
                result = self.out_queue.get()
                try:
                    self._store_rows(self._extract_rows(result))
                    self._save_icons(result)
                except Exception as exc:
                    # Worker boundary: one malformed page must not kill the
                    # thread.
                    print("datamine failed: %s" % exc)
                finally:
                    self.out_queue.task_done()

        @classmethod
        def _extract_rows(cls, page):
            """Return one SQL parameter tuple per introduction block in ``page``.

            Each tuple is (name, version, developer, pubtime, filesize,
            support, introduction).  A field whose pattern does not match is
            replaced by the literal string ``'NULL'`` (a string, not SQL NULL),
            and ``<br />`` in the introduction is replaced by a space.
            """
            def first(pattern):
                found = pattern.findall(page)
                return found[0] if found else 'NULL'

            fixed = tuple(first(pattern) for pattern in cls._FIELD_RES)
            return [fixed + (intro.replace('<br />', ' '),)
                    for intro in cls._INTRO_RE.findall(page)]

        @classmethod
        def _icon_urls(cls, page):
            """Return the image URLs found in the page's ``appTitle`` block."""
            # The capture still carries the opening quote and the closing
            # quote plus one trailing character; strip them.
            return [src[1:-2] for src in cls._ICON_RE.findall(page)]

        def _store_rows(self, rows):
            """Insert ``rows`` into the ``address`` table, committing each row.

            Prints ``error2`` when connecting or inserting fails; a failed row
            does not prevent the remaining rows from being tried.
            """
            if not rows:
                return
            # NOTE(review): database credentials are hard-coded here.
            try:
                conn = MySQLdb.connect(host='localhost', user='root',
                                       passwd='123456', db='addressbookdb',
                                       charset="utf8")
            except MySQLdb.Error:
                print("error2")
                return
            try:
                cursor = conn.cursor()
                try:
                    for values in rows:
                        try:
                            cursor.execute(self._INSERT_SQL, values)
                            conn.commit()
                        except MySQLdb.Error:
                            print("error2")
                finally:
                    cursor.close()
            finally:
                conn.close()

        def _save_icons(self, page):
            """Download each image of ``page`` into ``picture/<num>``.

            Prints ``error4`` when a download or file write fails.
            """
            global num
            for url in self._icon_urls(page):
                print(url)
                try:
                    image = requests.get(url)
                    # Reserve the file name and bump the shared counter
                    # atomically so two workers never write the same file.
                    with self._num_lock:
                        handle = open("picture/" + str(num), "wb")
                        num = num + 1
                        print(num)
                    with handle:
                        handle.write(image.content)
                except (requests.RequestException, IOError):
                    print("error4")
    def main(first_page=1, last_page=2538):
        """Crawl the listing pages and feed every linked app page to the workers.

        Starts 20 ``ThreadUrl`` and 20 ``DatamineThread`` daemon workers once,
        then for each listing page ``apps_7_1_<k>.html`` with ``k`` from
        ``first_page`` to ``last_page`` (inclusive) queues the distinct
        ``/Product/App<digits>.html`` links it contains.  Returns after both
        queues have been fully processed.

        A listing page that cannot be downloaded is reported, followed by a
        5 second pause, and skipped.
        """
        # Start the worker pools once; the workers never exit, so starting
        # them per listing page would leak 40 threads per page.
        for _ in range(20):
            fetcher = ThreadUrl(queue, out_queue)
            fetcher.daemon = True
            fetcher.start()
        for _ in range(20):
            miner = DatamineThread(out_queue)
            miner.daemon = True
            miner.start()

        # Accept both relative links and links carrying the site prefix; only
        # the path is captured because the prefix is re-added below.
        link_re = re.compile(
            r'<a href="(?:http://apk\.gfan\.com)?(/Product/App\d{1,8}\.html)"')

        for k in range(first_page, last_page + 1):
            print(k)
            url = "http://apk.gfan.com/apps_7_1_" + str(k) + ".html"
            try:
                response = requests.get(url)
                try:
                    result = response.content
                finally:
                    response.close()
            except requests.RequestException as exc:
                print("listing page %s failed: %s" % (url, exc))
                time.sleep(5)
                continue

            # De-duplicate links that appear more than once on the page.
            for path in set(link_re.findall(result)):
                queue.put("http://apk.gfan.com" + path)

        # Wait until every page is fetched, then until every body is mined.
        queue.join()
        out_queue.join()





    #sql="select * from address"
    #cursor.execute(sql)
    #conn.commit()
    #finalresult=cursor.fetchall()
    #if finalresult:
    #for x in finalresult:
    #pass #print x[0:]


        
    # Start the crawl only when this file is executed as a script.
    if __name__ == "__main__":
        main()

  • 相关阅读:
    线程交互
    线程死锁
    多线程的同步-sychronized
    线程常见方法
    创建多线程
    消费!
    Redis基本认识
    在右键菜单中加入"在IDEA中打开" (Open in IDEA)
    安装coc.nvim时 报[coc.nvim] javascript file not found 错误的解决方案
    汇编语言的种类
  • 原文地址:https://www.cnblogs.com/aukle/p/3225834.html
Copyright © 2011-2022 走看看