2018-07-10
#coding:utf-8
"""Simple crawler that prints the link and title of every movie listed on
the paginated bd-film.co index pages.

Each output line has the form "<href> <title>".
"""
from lxml import etree
import requests
import pandas  # not used by the code shown here; kept in case other code relies on it
import time

# A listing page URL is URL_PREFIX + <page number> + URL_SUFFIX.
# NOTE(review): the suffix begins with a literal "1", so page 2 requests
# "index_21.htm" rather than "index_2.htm". This is preserved from the
# original script -- confirm against the site's real pagination scheme.
URL_PREFIX = 'http://www.bd-film.co/movies/index_'
URL_SUFFIX = '1.htm'

# Inclusive range of page numbers to crawl.
FIRST_PAGE = 1
LAST_PAGE = 916

# Pause before every request so the site is not hammered.
REQUEST_DELAY_SECONDS = 1

# requests waits indefinitely unless a timeout is given; bound the wait so a
# stalled connection raises instead of hanging the whole crawl.
REQUEST_TIMEOUT_SECONDS = 30

HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 6.1; WOW64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/64.0.3282.186 Safari/537.36"
    ),
}

# Anchor elements that carry both the movie link (@href) and name (@title).
MOVIE_LINK_XPATH = "//div[@class='text-overflow']//a"


def _fetch_listing(page):
    """Download listing page number *page* and return its parsed HTML tree.

    Raises whatever ``requests.get`` raises (including on timeout); errors
    are not swallowed, matching the original script's fail-fast behaviour.
    """
    url = URL_PREFIX + str(page) + URL_SUFFIX
    response = requests.get(
        url=url,
        headers=HEADERS,
        timeout=REQUEST_TIMEOUT_SECONDS,
    )
    return etree.HTML(response.content)


def _extract_movie_lines(tree):
    """Return one "<href> <title>" string per movie link found in *tree*.

    The hrefs and titles are queried separately and paired positionally;
    ``zip`` stops at the shorter list, so an unmatched trailing href or
    title is silently dropped (same as the original script).
    """
    hrefs = tree.xpath(MOVIE_LINK_XPATH + "/@href")
    titles = tree.xpath(MOVIE_LINK_XPATH + "/@title")
    return [href + ' ' + title for href, title in zip(hrefs, titles)]


def main():
    """Crawl every listing page in order and print each movie line."""
    for page in range(FIRST_PAGE, LAST_PAGE + 1):
        time.sleep(REQUEST_DELAY_SECONDS)
        for line in _extract_movie_lines(_fetch_listing(page)):
            print(line)


if __name__ == '__main__':
    main()
这是一个简单的爬虫脚本,后面会在这个基础上编写一个多线程版本,以提高爬取速率。