xpath

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>xpath的基本语法</title>
</head>
<body>
<script>
    /*
     * Basic XPath syntax, illustrated with the <note> document below.
     *
     * Expressions that select ELEMENTS:
     *   1. Path from the root:  /     e.g. /html/body/note/book
     *   2. Match at any depth:  //    e.g. //book
     *   3. Positional index:    []    Indices start at 1 and count among
     *      siblings under the same parent.
     *        //book[1]   works as expected (the books are siblings).
     *        //title[1]  does NOT pick one single title: the <title>
     *                    elements are not siblings, so each one is the
     *                    first <title> of its own parent.
     *      Counting from the end uses last():
     *        //book[last()]     the last book
     *        //book[last()-1]   the second-to-last book, and so on
     *      A position test is also possible:
     *        //book[position()>1]
     *   4. Exact match on an attribute (attribute selector):
     *        //title[@name="zcb1"]
     *
     * Expressions that select VALUES (strings):
     *   5. Text wrapped by a tag, via text():
     *        //book[2]/title/text()
     *   6. Value of an attribute, via @attribute-name:
     *        //book[1]/title/@name
     */
</script>
<note>
    <book>
        <title name="zcb1">哈利波特1</title>
        <price>100</price>
    </book>
    <book>
        <title>哈利波特2</title>
        <price>101</price>
    </book>
    <book>
        <title>哈利波特3</title>
        <price>102</price>
    </book>
</note>
</body>
</html>
xpath 还支持模糊查询(例如 contains() 函数),用法见下文 lxml 示例中的 //li[contains(@class,'it')]。
c
xpath 练习:
https://movie.douban.com/top250

//ol[@class="grid_view"]/li//span[@class="title"][1]/text()

//ol/li//div[@class="hd"]/a/@href

//ol/li//div[@class="star"]/span[last()]

//a[text()='下一页>']/@href
lxml:
它也是一个第三方库(需要先安装:pip install lxml),用来把 HTML 字符串解析成可以执行 xpath 查询的对象:

from lxml import etree


def test():
    """Run a handful of XPath queries against a small inline HTML document.

    The markup is parsed with ``etree.HTML`` and the result of each query
    is printed. ``xpath()`` returns a list for every query used here.
    """
    # NOTE: the final <li> in the markup is intentionally left unclosed.
    html_str = '''
        <!DOCTYPE html>
        <html lang="en">
        <head>
            <meta charset="UTF-8">
            <title>Title</title>
        </head>
        <body>
            <div>
                <ul>
                    <li class="item1"><a href="link1.html">first item</a></li>
                    <li class="item2"><a href="link2.html">ist item</a></li>
                    <li class="item3"><a href="link3.html">f item</a></li>
                    <li class="item4"><a href="link4.html">st item</a></li>
                    <li class="item5"><a href="link5.html">fi item</a></li>
                    <li class="item6"><a href="link6.html">tte item</a></li>
                    <li> <!-- 此处 li 标签未封闭 。但是它也是不影响结果的-->
                </ul>
            </div>
        </body>
        </html>
    '''

    # Step 1: turn the string into an element tree.
    root = etree.HTML(html_str)

    # Step 2: query it with XPath. Element query: every <li> under a <ul>.
    li_nodes = root.xpath("/html//ul/li")
    print(li_nodes)

    # Text wrapped by the <a> whose href is link1.html -> ['first item'].
    link_text = root.xpath("//a[@href='link1.html']/text()")
    print(link_text)

    # Attribute value: the class of the third <li>.
    third_class = root.xpath("//li[3]/@class")
    print(third_class)

    # Fuzzy match with contains(): every <li> whose class contains "it".
    fuzzy_nodes = root.xpath("//li[contains(@class,'it')]")
    print(fuzzy_nodes)

    # Tip: for messy input, etree.tostring(root).decode("utf8") serialises
    # the parsed tree, with unclosed tags completed by the parser.


if __name__ == '__main__':
    test()
xpath 和 lxml 的 代码使用步骤:

import random

import requests

# Pool of User-Agent strings; one is picked at random per spider instance.
USER_AGENT = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
    # IPhone
    "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    # IPod
    "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    # IPAD
    "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
    "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    # Android
    "Mozilla/5.0 (Linux; U; Android 2.2.1; zh-cn; HTC_Wildfire_A3333 Build/FRG83D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    # QQ Browser, Android build
    "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    # Android Opera Mobile
    "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
    # Android Pad Moto Xoom
    "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
    # BlackBerry
    "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
    # WebOS HP Touchpad
    "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
    # Nokia N97
    "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
    # Windows Phone Mango
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
    # UC Browser
    "UCWEB7.0.2.37/28/999",
    "NOKIA5700/ UCWEB7.0.2.37/28/999",
    # UCOpenwave
    "Openwave/ UCWEB7.0.2.37/28/999",
    # UC Opera
    "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999",
]


# Example request URL: https://search.chinahr.com/bj/job/pn1/?key=python
class ChinaHRSpider():
    """Fetch the first ChinaHR search-result page and save the raw HTML."""

    def __init__(self):
        self.base_url = "https://search.chinahr.com/bj/job/pn1/"
        self.headers = {"User-Agent": random.choice(USER_AGENT)}

    # 1. Send the request.
    def send_requests(self, params, timeout=10):
        """Return the body of ``base_url`` decoded as UTF-8.

        Args:
            params: Query-string parameters passed to ``requests.get``.
            timeout: Seconds before the request is abandoned. Without a
                timeout ``requests`` may wait on the server indefinitely.

        Raises:
            requests.HTTPError: If the server answers with a 4xx/5xx status.
            requests.Timeout: If no response arrives within ``timeout``.
        """
        response = requests.get(
            self.base_url,
            headers=self.headers,
            params=params,
            timeout=timeout,
        )
        # Fail loudly instead of saving an error page as if it were results.
        response.raise_for_status()
        # utf8 is also what bytes.decode() uses by default.
        return response.content.decode("utf8")

    # 2. Parse the data (to be done with XPath).
    def parase_data(self, data):
        """Placeholder for XPath parsing; does nothing and returns None."""
        pass

    # 3. Save the data.
    def write_file(self, data):
        """Write ``data`` to ChinaHR.html, replacing any existing file."""
        with open("ChinaHR.html", "w", encoding="utf8") as f:
            f.write(data)

    # 4. Orchestrate the steps.
    def run(self):
        """Download the search page for the keyword "python" and save it."""
        params = {
            "key": "python",
        }
        data_str = self.send_requests(params)
        self.write_file(data_str)


if __name__ == '__main__':
    ChinaHRSpider().run()
下面解析上面拿到的html文档。

import random

import requests
from lxml import etree

# Pool of User-Agent strings; one is picked at random per spider instance.
USER_AGENT = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
    # IPhone
    "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    # IPod
    "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    # IPAD
    "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
    "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    # Android
    "Mozilla/5.0 (Linux; U; Android 2.2.1; zh-cn; HTC_Wildfire_A3333 Build/FRG83D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    # QQ Browser, Android build
    "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    # Android Opera Mobile
    "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
    # Android Pad Moto Xoom
    "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
    # BlackBerry
    "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
    # WebOS HP Touchpad
    "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
    # Nokia N97
    "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
    # Windows Phone Mango
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
    # UC Browser
    "UCWEB7.0.2.37/28/999",
    "NOKIA5700/ UCWEB7.0.2.37/28/999",
    # UCOpenwave
    "Openwave/ UCWEB7.0.2.37/28/999",
    # UC Opera
    "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999",
]


# Example request URL: https://search.chinahr.com/bj/job/pn1/?key=python
class ChinaHRSpider():
    """Download a ChinaHR search page and extract job titles with XPath."""

    def __init__(self):
        self.base_url = "https://search.chinahr.com/bj/job/pn1/"
        self.headers = {"User-Agent": random.choice(USER_AGENT)}

    # 1. Send the request.
    def send_requests(self, params, timeout=10):
        """Return the body of ``base_url`` decoded as UTF-8.

        Args:
            params: Query-string parameters passed to ``requests.get``.
            timeout: Seconds before the request is abandoned. Without a
                timeout ``requests`` may wait on the server indefinitely.

        Raises:
            requests.HTTPError: If the server answers with a 4xx/5xx status.
            requests.Timeout: If no response arrives within ``timeout``.
        """
        response = requests.get(
            self.base_url,
            headers=self.headers,
            params=params,
            timeout=timeout,
        )
        # Fail loudly instead of saving an error page as if it were results.
        response.raise_for_status()
        # utf8 is also what bytes.decode() uses by default.
        return response.content.decode("utf8")

    # 2. Parse the data with XPath.
    def parase_data(self, data_str):
        """Return the ``title`` attributes found in the job list.

        Args:
            data_str: HTML text of a search-result page.

        Returns:
            The list produced by the XPath query; empty when nothing matches.
        """
        # a. Convert the string into an element tree.
        html_data = etree.HTML(data_str)
        # b. Query it. The result used to be discarded; hand it back instead.
        return html_data.xpath(
            '//div[@class="job-list-box"]//ul[1]/li/@title'
        )

    # 3. Save the data.
    def write_file(self, data):
        """Write ``data`` to ChinaHR.html, replacing any existing file."""
        with open("ChinaHR.html", "w", encoding="utf8") as f:
            f.write(data)

    # 4. Orchestrate the steps.
    def run(self):
        """Download the search page for the keyword "python" and save it."""
        params = {
            "key": "python",
        }
        data_str = self.send_requests(params)
        self.write_file(data_str)


def test():
    """Parse the previously saved ChinaHR.html offline and show the titles.

    Returns:
        The list of job titles extracted from the saved page.
    """
    with open("ChinaHR.html", "r", encoding="utf8") as f:
        data_str = f.read()

    # Reuse the spider's parser rather than a second copy of the same XPath.
    # Constructing the spider performs no network or file I/O.
    jobname_list = ChinaHRSpider().parase_data(data_str)
    print(jobname_list)
    return jobname_list


if __name__ == '__main__':
    # ChinaHRSpider().run()  # run once first to produce ChinaHR.html
    test()

import json
import random

import requests
from lxml import etree

# Pool of User-Agent strings; one is picked at random per spider instance.
USER_AGENT = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
    # IPhone
    "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    # IPod
    "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    # IPAD
    "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
    "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    # Android
    "Mozilla/5.0 (Linux; U; Android 2.2.1; zh-cn; HTC_Wildfire_A3333 Build/FRG83D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    # QQ Browser, Android build
    "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    # Android Opera Mobile
    "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
    # Android Pad Moto Xoom
    "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
    # BlackBerry
    "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
    # WebOS HP Touchpad
    "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
    # Nokia N97
    "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
    # Windows Phone Mango
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
    # UC Browser
    "UCWEB7.0.2.37/28/999",
    "NOKIA5700/ UCWEB7.0.2.37/28/999",
    # UCOpenwave
    "Openwave/ UCWEB7.0.2.37/28/999",
    # UC Opera
    "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999",
]


# Example request URL: https://search.chinahr.com/bj/job/pn1/?key=python
class ChinaHRSpider():
    """Scrape the first ChinaHR search page and store the jobs as JSON."""

    def __init__(self):
        self.base_url = "https://search.chinahr.com/bj/job/pn1/"
        self.headers = {"User-Agent": random.choice(USER_AGENT)}

    # 1. Send the request.
    def send_requests(self, params, timeout=10):
        """Return the body of ``base_url`` decoded as UTF-8.

        Args:
            params: Query-string parameters passed to ``requests.get``.
            timeout: Seconds before the request is abandoned. Without a
                timeout ``requests`` may wait on the server indefinitely.

        Raises:
            requests.HTTPError: If the server answers with a 4xx/5xx status.
            requests.Timeout: If no response arrives within ``timeout``.
        """
        response = requests.get(
            self.base_url,
            headers=self.headers,
            params=params,
            timeout=timeout,
        )
        # Fail loudly instead of parsing an error page as if it were results.
        response.raise_for_status()
        # utf8 is also what bytes.decode() uses by default.
        return response.content.decode("utf8")

    @staticmethod
    def _first(node, path):
        """Return the first match of ``path`` under ``node`` as str, or None."""
        matches = node.xpath(path)
        return str(matches[0]) if matches else None

    def _parse_row(self, div):
        """Build the dict for one result row, or None if it has no job name."""
        job_name = self._first(div, 'ul/li[@class="job-name"]/@title')
        if job_name is None:
            # A child <div> without a job-name entry is not treated as a
            # listing; the unguarded [0] used to raise IndexError here.
            return None

        fabu_date = (
            self._first(div, 'ul/li[@class="fabu-date"]/text()') or ""
        ).strip(" ")
        # Evaluate the job-address query once instead of three times.
        address_info = (
            self._first(div, 'ul/li[@class="job-address"]/text()') or ""
        ).strip(" ")
        company_name = (
            self._first(div, 'ul/li[@class="job-company"]/text()') or ""
        ).strip(" ")

        # The text is split on '|' into three fields; judging by the key
        # names they are address / experience / education -- TODO confirm
        # against the live page. Pad so a shorter value cannot raise.
        parts = address_info.split('|')
        parts.extend([""] * (3 - len(parts)))
        job_address, jingyan, xueli = parts[:3]

        return {
            "job_name": job_name,
            "fabu_date": fabu_date,
            "job_address": job_address,
            "jingyan": jingyan,
            "xueli": xueli,
            "company_name": company_name,
        }

    # 2. Parse the data.
    def parase_data(self, data_str):
        """Extract one dict per job row from the page HTML.

        Args:
            data_str: HTML text of a search-result page.

        Returns:
            A list of dicts with the keys ``job_name``, ``fabu_date``,
            ``job_address``, ``jingyan``, ``xueli`` and ``company_name``.
        """
        # a. Convert the string into an element tree.
        html_data = etree.HTML(data_str)
        # b. Each direct child <div> of the job-list box is one result row.
        divs = html_data.xpath('//div[@class="job-list-box"]/div')
        rows = (self._parse_row(div) for div in divs)
        return [row for row in rows if row is not None]

    # 3. Save the data.
    def write_file(self, data):
        """Dump ``data`` to ChinaHR.json, keeping non-ASCII text readable."""
        with open("ChinaHR.json", "w", encoding="utf8") as f:
            json.dump(data, f, ensure_ascii=False)

    # 4. Orchestrate the steps.
    def run(self):
        """Download, parse and save the results for the keyword "python"."""
        params = {
            "key": "python",
        }
        data_str = self.send_requests(params)
        ret = self.parase_data(data_str)
        self.write_file(ret)


if __name__ == '__main__':
    ChinaHRSpider().run()

import json
import random

import requests
from lxml import etree

# Pool of User-Agent strings; one is picked at random per spider instance.
USER_AGENT = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
    # IPhone
    "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    # IPod
    "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    # IPAD
    "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
    "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    # Android
    "Mozilla/5.0 (Linux; U; Android 2.2.1; zh-cn; HTC_Wildfire_A3333 Build/FRG83D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    # QQ Browser, Android build
    "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    # Android Opera Mobile
    "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
    # Android Pad Moto Xoom
    "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
    # BlackBerry
    "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
    # WebOS HP Touchpad
    "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
    # Nokia N97
    "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
    # Windows Phone Mango
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
    # UC Browser
    "UCWEB7.0.2.37/28/999",
    "NOKIA5700/ UCWEB7.0.2.37/28/999",
    # UCOpenwave
    "Openwave/ UCWEB7.0.2.37/28/999",
    # UC Opera
    "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999",
]


# Example request URL: https://search.chinahr.com/bj/job/pn9/?key=python
class ChinaHRSpider():
    """Scrape several ChinaHR search pages and store the jobs as JSON."""

    def __init__(self):
        # The page number and a trailing "/" are appended per request.
        self.base_url = "https://search.chinahr.com/bj/job/pn"
        self.headers = {"User-Agent": random.choice(USER_AGENT)}
        # Accumulates the rows of every page parsed so far.
        self.job_list = []

    # 1. Send the request.
    def send_requests(self, new_url, params, timeout=10):
        """Return the body of ``new_url`` decoded as UTF-8.

        Args:
            new_url: Full URL of the page to download.
            params: Query-string parameters passed to ``requests.get``.
            timeout: Seconds before the request is abandoned. Without a
                timeout ``requests`` may wait on the server indefinitely.

        Raises:
            requests.HTTPError: If the server answers with a 4xx/5xx status.
            requests.Timeout: If no response arrives within ``timeout``.
        """
        response = requests.get(
            new_url,
            headers=self.headers,
            params=params,
            timeout=timeout,
        )
        # Fail loudly instead of parsing an error page as if it were results.
        response.raise_for_status()
        # utf8 is also what bytes.decode() uses by default.
        return response.content.decode("utf8")

    @staticmethod
    def _first(node, path):
        """Return the first match of ``path`` under ``node`` as str, or None."""
        matches = node.xpath(path)
        return str(matches[0]) if matches else None

    def _parse_row(self, div):
        """Build the dict for one result row, or None if it has no job name."""
        job_name = self._first(div, 'ul/li[@class="job-name"]/@title')
        if job_name is None:
            # A child <div> without a job-name entry is not treated as a
            # listing; the unguarded [0] used to raise IndexError here.
            return None

        fabu_date = (
            self._first(div, 'ul/li[@class="fabu-date"]/text()') or ""
        ).strip(" ")
        # Evaluate the job-address query once instead of three times.
        address_info = (
            self._first(div, 'ul/li[@class="job-address"]/text()') or ""
        ).strip(" ")
        company_name = (
            self._first(div, 'ul/li[@class="job-company"]/text()') or ""
        ).strip(" ")

        # The text is split on '|' into three fields; judging by the key
        # names they are address / experience / education -- TODO confirm
        # against the live page. Pad so a shorter value cannot raise.
        parts = address_info.split('|')
        parts.extend([""] * (3 - len(parts)))
        job_address, jingyan, xueli = parts[:3]

        return {
            "job_name": job_name,
            "fabu_date": fabu_date,
            "job_address": job_address,
            "jingyan": jingyan,
            "xueli": xueli,
            "company_name": company_name,
        }

    # 2. Parse the data.
    def parase_data(self, data_str):
        """Append one dict per job row of the page to ``self.job_list``.

        Args:
            data_str: HTML text of a search-result page.
        """
        # a. Convert the string into an element tree.
        html_data = etree.HTML(data_str)
        # b. Each direct child <div> of the job-list box is one result row.
        divs = html_data.xpath('//div[@class="job-list-box"]/div')
        for div in divs:
            row = self._parse_row(div)
            if row is not None:
                self.job_list.append(row)

    # 3. Save the data.
    def write_file(self):
        """Dump ``self.job_list`` to ChinaHR.json, keeping non-ASCII text."""
        with open("ChinaHR.json", "w", encoding="utf8") as f:
            json.dump(self.job_list, f, ensure_ascii=False)

    # 4. Orchestrate the steps.
    def run(self, first_page=1, last_page=9):
        """Scrape pages ``first_page``..``last_page`` (inclusive) and save.

        Args:
            first_page: First result page to download (default 1).
            last_page: Last result page to download (default 9).
        """
        params = {
            "key": "python",
        }
        pages_parsed = 0
        try:
            for pageNum in range(first_page, last_page + 1):
                # Announce the page before fetching it, not afterwards.
                print("正在下载第{}页......".format(pageNum))
                new_url = self.base_url + str(pageNum) + "/"
                data_str = self.send_requests(new_url, params)
                self.parase_data(data_str)
                pages_parsed += 1
        finally:
            # Write once at the end instead of rewriting the whole file
            # after every page. If a later page fails, the pages already
            # parsed are still saved; if nothing was parsed, any existing
            # file is left alone.
            if pages_parsed:
                self.write_file()


if __name__ == '__main__':
    ChinaHRSpider().run()