功能点:scrapy基本使用
爬取网站:传智播客老师
完整代码:https://files.cnblogs.com/files/bookwed/first.zip
主要代码:
ff.py
# -*- coding: utf-8 -*-
import scrapy
from first.items import FirstItem


class FfSpider(scrapy.Spider):
    """Spider that scrapes teacher name / level / description from itcast.cn.

    scrapy.Spider is the most basic spider class; every spider must
    inherit from it.
    """

    # Spider name, used as `scrapy crawl ff`.
    name = 'ff'
    # Allowed domains (optional) — requests outside these are dropped.
    allowed_domains = ['itcast.cn']
    start_urls = ['http://www.itcast.cn/channel/teacher.shtml']

    def parse(self, response):
        """Default Request callback: parse the teacher-list page.

        Yields one populated FirstItem per teacher block on the page.
        """
        teacher_list = response.xpath("//div[@class='li_txt']")
        for teacher in teacher_list:
            # Create one item object per teacher.
            # (Original author's note: they accidentally wrote `item`
            # instead of `teacher` here once and lost a lot of time.)
            item = FirstItem()
            # FIX: the original used extract()[0], which raises IndexError
            # when the xpath matches nothing, killing the whole crawl on a
            # single malformed entry. extract_first() yields None instead.
            item["name"] = teacher.xpath("./h3/text()").extract_first()
            item["level"] = teacher.xpath("./h4/text()").extract_first()
            item["desc"] = teacher.xpath("./p/text()").extract_first()
            yield item
pipelines.py
import json


# Note: enable this pipeline in settings.py (ITEM_PIPELINES) — don't
# forget to uncomment it, or Scrapy never calls it. Deduplication of
# items could also be done here.
class FirstPipeline(object):
    """Item pipeline that appends every scraped item to teachers.json."""

    def __init__(self):
        # Single file handle for the whole crawl; closed in close_spider().
        self.f = open('teachers.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Serialize one item as JSON and append it to the output file.

        Returns the item unchanged so any later pipelines still see it.
        """
        print(dict(item))
        # ensure_ascii=False keeps Chinese text readable in the file.
        content = json.dumps(dict(item), ensure_ascii=False)
        self.f.write(content + ",")
        self.f.write(" ")
        return item

    def close_spider(self, spider=None):
        """Close the output file when the spider finishes.

        FIX: Scrapy invokes close_spider(spider); the original signature
        close_spider(self) raised TypeError at shutdown, so the file was
        never closed. The default value keeps any old callers working.
        """
        self.f.close()