# -*- coding: utf-8 -*-
# City-switch page listing the eight target cities:
# https://www.58.com/changecity.html?catepath=zhuangxiujc.shtml&catename=%E8%A3%85%E4%BF%AE%E5%BB%BA%E6%9D%90&fullpath=26509&PGTID=0d20678d-0000-38aa-12e6-c410f82e0e6b&ClickID=1
# Jinan:     https://jn.58.com/zhuangxiujc.shtml
# Tai'an:    https://ta.58.com/zhuangxiujc.shtml
# Qingdao:   https://qd.58.com/zhuangxiujc.shtml
# Shenzhen:  https://sz.58.com/zhuangxiujc.shtml
# Guangzhou: https://gz.58.com/zhuangxiujc.shtml
# Zhuhai:    https://zh.58.com/zhuangxiujc.shtml
# Foshan:    https://fs.58.com/zhuangxiujc.shtml
# Hefei:     https://hf.58.com/zhuangxiujc.shtml
#
# Local-services entry URL:
# https://fs.58.com/zhuangxiujc.shtml?PGTID=0d300261-000d-e7ff-41a5-94b92c564bb2&ClickID=1
#
# From each city page, collect the URLs of these six category pages
# (crawling every category multiplies the page count, and not every
# category actually has 100 pages):
#   Renovation & construction
#     home renovation services         /jiazhuang/
#     shop & building fit-out          /gongzhuang/
#     rebuilding & refurbishment       /fanjiangaizao/
#   Building materials & tools
#     materials & tools purchase       /jiancai/
#   Furniture & home decor
#     custom / ready-made furniture    /jiajusp/
#     home textiles & decor            /jiajuzs/
# e.g. https://fs.58.com/jiajuzs/?PGTID=0d20678d-000d-e495-0620-6d23e2a9a7c5&ClickID=2
#
# Enter each of the six category pages and scrape every company listing,
# up to 100 pages per category (some categories have fewer). Fields per company:
#   company name
#   phone -- only shown after an automated click:
#     phone overlay  '/html/body/div[14]/div/div[1]/div[1]'
#     close button   '/html/body/div[14]/div/div[2]'
#   title -- e.g. new home, second-hand refurbishment, free design,
#            home decoration, renovate-first-pay-later
#   description
#   url

import scrapy
from wuba.items import WubaItem
import requests  # used only in the commented-out experiments below
import time


class WubasjSpider(scrapy.Spider):
    name = 'wubasj'
    # allowed_domains = ['fs.58.com']
    start_urls = ['https://jn.58.com/zhuangxiujc.shtml']
    # All eight city entry points:
    # start_urlss = ['https://jn.58.com/zhuangxiujc.shtml',
    #                'https://ta.58.com/zhuangxiujc.shtml',
    #                'https://qd.58.com/zhuangxiujc.shtml',
    #                'https://sz.58.com/zhuangxiujc.shtml',
    #                'https://gz.58.com/zhuangxiujc.shtml',
    #                'https://zh.58.com/zhuangxiujc.shtml',
    #                'https://fs.58.com/zhuangxiujc.shtml',
    #                'https://hf.58.com/zhuangxiujc.shtml']

    def parse(self, response):
        # Sidebar category boxes; the last three are unrelated to
        # renovation and are skipped.
        classifys = response.xpath(
            "//dl[@class='nav-content__catebox__sidebar--cateitem _catecss-item']")
        for classify in classifys[:-3]:
            url = 'https://jn.58.com' + classify.xpath("./dt/a/@href").extract_first('')
            yield scrapy.Request(url=url, callback=self.issuer, meta={'pa_url': url})

        # --- earlier XPath experiments, kept for reference ---
        # chengshi = response.xpath('//*[@id="content-box"]')
        # chengshi2 = response.xpath('//*[@id="content-box"]/div[1]/div/div[2]/a[1]/text()').extract()
        # xiangxi = chengshi.xpath('./div/div/div/a/@href').extract()
        # print(chengshi2, xiangxi)
        # k = response.xpath('/html/body/div[4]/div/div/div')
        # xx1 = k.xpath('./div[1]/dl/dd/a/text()').extract()
        # xx2 = k.xpath('./div[2]/dl/dd/a/text()').extract()
        # xx3 = k.xpath('./div[3]/dl/dd/a/text()').extract()
        # print(xx1, xx2, xx3)
        # items = WubaItem()
        # items_1 = []
        # a = response.xpath('/html/body/div[4]/div/div/div')
        # lj1 = ['https://jn.58.com' + x for x in a.xpath('./div[1]/dl/dd/a/@href').extract()]
        # lj2 = ['https://jn.58.com' + x for x in a.xpath('./div[2]/dl/dd/a/@href').extract()]
        # lj3 = ['https://jn.58.com' + x for x in a.xpath('./div[3]/dl/dd/a/@href').extract()]
        # for lj in lj1, lj2, lj3:
        #     items_1.append(lj)
        #     items['bendi'] = lj[0]
        #     yield scrapy.Request(url=items['bendi'], meta={'lj_lj': items}, callback=self.parse_sj)
        # print(items_1)
        # for items in items_1:
        #     yield Request(url=items, callback=self.parse)

    # def parse_sj(self, response):
    #     item = response.meta['lj_lj']  # (bug noted at the time: forgot to receive the upstream data)
    #     print('+++++++++++555555555555555555++++++++++')
    #     res = requests.get(response.url)
    #     res.encoding = 'utf-8'
    #     html = res.text
    #     print(html)
    #     print('+++++++++++555555555555555555++++++++++')
    #     gongsi = response.xpath('//tbody/tr/td[2]/p[1]/text()').extract()
    #     mingcheng = response.xpath('//tbody/tr/td[2]/a/text()').extract()
    #     print(mingcheng, gongsi)

    def issuer(self, response):
        issuers = response.xpath("//table[@id='jingzhun']")
        for issuer in issuers:
            trs = issuer.xpath("./tr[@class='ac_item']")
            for tr in trs:
                # A listing row links either from the text cell or the image cell.
                url = tr.xpath("./td[@class='t']/div/a/@href").extract_first('')
                if url == '':
                    url = tr.xpath("./td[@class='img']/div/a/@href").extract_first('')
                # Skip empty hrefs and the "quick post" promo row.
                if url != '//fangxin.58.com/demand/form/quickpost?cateid=4063?from=pc_fangxin_zhuangxiu_listno1' and url != '':
                    # urljoin resolves scheme-relative hrefs ('//...') that
                    # scrapy.Request would otherwise reject.
                    yield scrapy.Request(url=response.urljoin(url),
                                         callback=self.deal,
                                         meta={'deal_url': url,
                                               'pa_url': response.meta.get('pa_url', '')})

    def deal(self, response):
        company_name = response.xpath(
            "//div[@class='shopinfo__title']/h2/text()").extract_first('').strip()
        title = response.xpath(
            "//div[@class='detail-title']/h1[@class='detail-title__name']/text()"
        ).extract_first('').strip()
        # category = response.xpath("//div[@class='infocard__container noswitch']/div[@class='infocard__container__item__main']/text()").extract()
        site = ''.join(response.xpath(
            "//div[@class='infocard__container__item infocard__container__item--shopaddress']"
            "/div[@class='infocard__container__item__main']/a/text()").extract()).strip()
        # join() turns an empty extract() result into '' rather than leaving a list
        introduce = ''.join(response.xpath(
            "//div[@class='foldingbox']/article/text()").extract()).strip()
        url = response.meta.get('deal_url', '')
        pa_url = response.meta.get('pa_url', '')
        # Debug output.
        print(company_name)
        print(title)
        # print(category)
        print(site)
        print(introduce)

        item = WubaItem()
        item['bendi'] = pa_url         # category/listing URL the company came from
        item['gongsi'] = company_name  # company name
        item['mingcheng'] = title      # listing title
        item['miaoshu'] = introduce    # description
        item['lianjie'] = url          # detail-page URL
        yield item
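The import from wuba.items assumes an item class with the five fields written in deal(). A minimal wuba/items.py matching those field names might look like this (a sketch inferred from the spider, not necessarily the repo's actual file):

import scrapy

class WubaItem(scrapy.Item):
    bendi = scrapy.Field()      # source category/listing URL
    gongsi = scrapy.Field()     # company name
    mingcheng = scrapy.Field()  # listing title
    miaoshu = scrapy.Field()    # description
    lianjie = scrapy.Field()    # detail-page URL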
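The commented-out start_urlss list suggests the spider was meant to cover all eight cities. One way to do that (a sketch, assuming every city subdomain shares the same page structure) is a start_requests() that yields each city page, with parse() using response.urljoin() instead of the hard-coded jn.58.com prefix so category links stay on the current city's host:

    # inside WubasjSpider, replacing the single-city start_urls
    def start_requests(self):
        for url in ('https://jn.58.com/zhuangxiujc.shtml',
                    'https://ta.58.com/zhuangxiujc.shtml',
                    'https://qd.58.com/zhuangxiujc.shtml',
                    'https://sz.58.com/zhuangxiujc.shtml',
                    'https://gz.58.com/zhuangxiujc.shtml',
                    'https://zh.58.com/zhuangxiujc.shtml',
                    'https://fs.58.com/zhuangxiujc.shtml',
                    'https://hf.58.com/zhuangxiujc.shtml'):
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        classifys = response.xpath(
            "//dl[@class='nav-content__catebox__sidebar--cateitem _catecss-item']")
        for classify in classifys[:-3]:
            href = classify.xpath("./dt/a/@href").extract_first('')
            url = response.urljoin(href)  # keeps the current city's subdomain
            yield scrapy.Request(url=url, callback=self.issuer, meta={'pa_url': url})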
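The notes call for up to 100 listing pages per category while warning that many categories have fewer. Assuming 58's /pn{n}/ pagination pattern (an assumption; the live site should be checked), issuer() could carry a page counter in meta and stop as soon as a page comes back without listing rows:

    # inside WubasjSpider; pagination sketch for issuer()
    def issuer(self, response):
        rows = response.xpath("//table[@id='jingzhun']/tr[@class='ac_item']")
        if not rows:
            return  # ran out of pages before reaching 100
        for tr in rows:
            pass  # extract the detail-page URL and yield a Request to self.deal, as above
        page = response.meta.get('page', 1)
        if page < 100:
            pa_url = response.meta.get('pa_url', '')
            # assumed URL pattern: https://jn.58.com/jiazhuang/pn2/, /pn3/, ...
            next_url = pa_url.rstrip('/') + '/pn%d/' % (page + 1)
            yield scrapy.Request(url=next_url, callback=self.issuer,
                                 meta={'pa_url': pa_url, 'page': page + 1})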
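Finally, the phone number only appears after a click, which Scrapy's plain HTTP requests cannot trigger; the two XPaths in the header comment point at the phone overlay and its close button. A minimal Selenium sketch under those assumptions (the "show phone" button locator is a placeholder, and the div indexes may shift with page layout):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def fetch_phone(driver, detail_url):
    driver.get(detail_url)
    # Placeholder locator for the "show phone" button -- inspect the page.
    driver.find_element(By.PARTIAL_LINK_TEXT, '电话').click()
    # Phone overlay XPath taken from the header notes.
    overlay = WebDriverWait(driver, 10).until(
        EC.visibility_of_element_located(
            (By.XPATH, '/html/body/div[14]/div/div[1]/div[1]')))
    phone = overlay.text.strip()
    # Close-button XPath from the header notes; dismiss before the next page.
    driver.find_element(By.XPATH, '/html/body/div[14]/div/div[2]').click()
    return phone

# usage: phone = fetch_phone(webdriver.Chrome(), item['lianjie'])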
...
Full code: https://github.com/mysteriousKiller/58