zoukankan      html  css  js  c++  java
  • Fetching web pages in Python with select (work in progress)

    #!/usr/bin/env python
    #encoding=utf-8
    import select,socket,codecs,doctest,time,datetime,os
    def read_urls(path="./book/1.csv", max_lines=100):
        """Build the list of book page URLs from a CSV file.

        Each usable line has the form ``<id>,<name>``; only the id (the text
        before the first comma) is used, substituted into the URL template.

        :param path: UTF-8 encoded CSV file to read.
        :param max_lines: number of leading file lines to consider, counting
            the first line, which is always skipped (presumably a header row).
            With the default of 100, at most 99 URLs are produced.
        :returns: list of URLs, in file order.
        :raises IOError: if the file cannot be opened.
        """
        prefix = "http://book.360buy.com/%s.html"
        urls = []
        # Iterate the file lazily and close it deterministically, instead of
        # reading every line into memory just to stop after ``max_lines``.
        with codecs.open(path, "r", "utf-8") as csv_file:
            for idx, line in enumerate(csv_file):
                if idx == 0:
                    continue
                if idx >= max_lines:
                    break
                wid, comma, _ = line.partition(",")
                if not comma:
                    # No comma on this line, so there is no id column to use.
                    continue
                urls.append(prefix % wid)
        return urls

    def _parser(url):
        """Split a URL into ``(hostname, path)``.

        A leading ``scheme://`` prefix of any length is stripped. The path
        always starts with ``/``; a URL without a path yields ``"/"``.

        :param url: URL string such as ``http://host/path``.
        :returns: ``(hostname, path)`` tuple of strings.

        >>> _parser("http://book.360buy.com/123.html")
        ('book.360buy.com', '/123.html')
        >>> _parser("http://book.360buy.com")
        ('book.360buy.com', '/')
        """
        rest = url
        scheme_end = url.find("://")
        # Only treat "://" as a scheme separator when it comes before any
        # slash, so a "://" inside the path or query is left alone.
        if scheme_end != -1 and "/" not in url[:scheme_end]:
            rest = url[scheme_end + 3:]
        # partition() never fails: a missing "/" simply gives an empty path.
        host, _, path = rest.partition("/")
        return (host, "/" + path)

    def fetch(url):
        """Connect to the URL's host and send an HTTP/1.0 GET request.

        The response is not read here: the connected socket is returned so the
        caller can wait for data on it and is responsible for closing it.
        The connection is always made to port 80, whatever the URL scheme.

        :param url: absolute URL, split into host and path by ``_parser``.
        :returns: connected socket with the complete request already sent.
        :raises socket.error: if connecting or sending fails; the socket is
            closed before the error propagates.
        """
        hostname, path = _parser(url)
        request = "".join([
            "GET %s HTTP/1.0\r\n" % path,
            "Host: %s\r\n" % hostname,
            "User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64; rv:17.0) Gecko/20100101 Firefox/17.0\r\n",
            "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\n",
            "Accept-Language: zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3\r\n",
            "Cookie: BAIDUID=4782C3288E4A1689E0F8CBC0DF82BB1D:FG=1; BDUT=sc2x4782C3288E4A1689E0F8CBC0DF82BB1D13bda69e4000; H_PS_PSSID=1428_1667_1662\r\n",
            "Cache-Control: max-age=0\r\n",
            "\r\n",
        ])
        # Sockets carry bytes. Encode text explicitly so the same code works
        # whether the request was built as a byte string or as unicode text.
        if not isinstance(request, bytes):
            request = request.encode("ascii")

        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            s.connect((hostname, 80))
            s.sendall(request)
        except Exception:
            # Do not leak the descriptor when the connection attempt fails.
            s.close()
            raise
        return s

    def async_down(urls, timeout=12, results_dir="./results"):
        """Download all URLs concurrently, multiplexing the sockets with select.

        A request is sent for every URL via ``fetch``; the responses are then
        read as they arrive. Each response (everything received on the
        socket) is decoded as GBK and written as UTF-8 to
        ``<results_dir>/<socket fileno>.html``. Output files of downloads that
        did not finish before the deadline are deleted. Finally the number of
        unfinished sockets is printed.

        :param urls: iterable of URLs to download.
        :param timeout: overall time budget in seconds for reading responses.
        :param results_dir: existing directory that receives the output files.
        :returns: None.
        """
        sockets = []
        outputs = {}  # socket fileno -> (output file, incremental GBK decoder)
        try:
            for url in urls:
                sockets.append(fetch(url))

            deadline = time.time() + timeout
            while sockets:
                remaining = deadline - time.time()
                if remaining <= 0:
                    break
                # Bound the wait: without a timeout a single stalled server
                # would block select() forever and defeat the deadline.
                rlist, _, _ = select.select(sockets, [], [], remaining)
                for s in rlist:
                    data = s.recv(40960)
                    if not data:
                        # An empty read means the peer closed the connection,
                        # i.e. this download is complete.
                        sockets.remove(s)
                        s.close()
                        continue
                    fileno = s.fileno()
                    if fileno not in outputs:
                        out_path = os.path.join(results_dir, "%s.html" % fileno)
                        outputs[fileno] = (
                            codecs.open(out_path, "w", "utf-8"),
                            # Incremental decoding keeps a multi-byte character
                            # that is split across two recv() chunks intact.
                            codecs.getincrementaldecoder("gbk")("ignore"),
                        )
                    out_file, decoder = outputs[fileno]
                    out_file.write(decoder.decode(data))
                    out_file.flush()
        finally:
            # Record which descriptors are unfinished before closing them,
            # because fileno() is no longer usable on a closed socket.
            unfinished = set(s.fileno() for s in sockets)
            for s in sockets:
                s.close()
            for fileno, (out_file, _) in outputs.items():
                out_file.close()
                if fileno in unfinished:
                    # Partial download: discard the truncated file.
                    os.remove(os.path.join(results_dir, "%s.html" % fileno))

        print("left socket %s" % len(sockets))

    if __name__=="__main__":
        # Script entry point: run one complete download pass over the URLs
        # from read_urls() and print how many whole seconds it took.
        began=datetime.datetime.now()
        async_down(read_urls())
        elapsed=datetime.datetime.now()-began
        print(elapsed.seconds)

  • 相关阅读:
    Extjs4.0中清空filefield已选文件
    .net操作读取word中的图像并保存
    WebForm_PostBackOptions未定义 错误排查
    数据库关键字
    VS2008生成WebSite和WebApplication的区别(转载)
    安装天乙论坛(SSH架构的开源项目)时遇到的问题
    Hibernate与Oracle char类型的列之间的兼容问题
    关于spring3使用AOP编程时需要引入哪些jar包的问题
    让IE支持HTML5的Canvas
    IIS + TOMCAT 注意事项
  • 原文地址:https://www.cnblogs.com/lexus/p/2848479.html
Copyright © 2011-2022 走看看