zoukankan      html  css  js  c++  java
  • 爬虫 cast day04

    xpath 

    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <title>xpath的基本语法</title>
    </head>
    <body>
        <script>
            /*
            *     xpath 基本语法:
            *       1, 根节点 : /  eg: /html/body/note/book
            *       2, 跨节点 : // eg: //book
            *       3, 使用下标: [],下标从1开始。下标是相对于各自的父节点来计算的。 eg: //book[1] 得到第一个 book;而 //title[1] 会匹配每个 book 下的第一个 title(即全部 title),想要整个文档中的第一个 title 要写成 (//title)[1]
            *                       下标如果倒着来的话,要用last()  eg:  //book[last()] 倒数第一个   //book[last()-1] 倒数第二个,以此类推
            *                        也可以这样:  //book[position()>1]  (除第一个以外的所有 book)
            *       4, 精确查找: 属性选择器 eg:  //title[@name="zcb1"]
            *       上面四个都是获取的是标签。
            *
            *       下面是获取具体的值
            *       5, 标签包裹的 内容 :使用text().  -->  string  eg://book[2]/title/text()   这得到的才是字符串
            *       6, 取标签中属性的 value  : 使用@属性名 -->string  eg : //book[1]/title/@name
            * */
        </script>
        <note>
            <book>
                <title name="zcb1">哈利波特1</title>
                <price>100</price>
            </book>
            <book>
                <title>哈利波特2</title>
                <price>101</price>
            </book>
            <book>
                <title>哈利波特3</title>
                <price>102</price>
            </book>
        </note>
    </body>
    </html>
    xpath 的基本语法

    xpath 还支持模糊查询,例如 contains():

    //li[contains(@class,"it")]   匹配 class 属性中包含 "it" 的所有 li 标签

    xpath 练习:

    https://movie.douban.com/top250

    //ol[@class="grid_view"]/li//span[@class="title"][1]/text()
    获取里面的电影的名字
    //ol/li//div[@class="hd"]/a/@href
    获取链接
    //ol/li//div[@class="star"]/span[last()]/text()
    评价人数(加上 /text() 才能取到文字,否则得到的是 span 标签本身)

    https://www.baidu.com

    //a[text()='下一页>']/@href
    根据文本内容选择 标签,实现翻页功能。

    lxml:

    它也是一个第三方库:

     1 from lxml  import etree
     2 
     def test():
         """Run a few sample XPath queries on an inline HTML snippet and print each result.

         The snippet's last ``<li>`` is left unclosed on purpose; as the note
         inside the snippet says, this does not affect the query results.
         Every ``xpath()`` call returns its matches as a list.
         """
         html_str = '''
     <!DOCTYPE html>
     <html lang="en">
     <head>
         <meta charset="UTF-8">
         <title>Title</title>
     </head>
     <body>
         <div>
             <ul>
                 <li class="item1"><a href="link1.html">first item</a></li>
                 <li class="item2"><a href="link2.html">ist item</a></li>
                 <li class="item3"><a href="link3.html">f item</a></li>
                 <li class="item4"><a href="link4.html">st item</a></li>
                 <li class="item5"><a href="link5.html">fi item</a></li>
                 <li class="item6"><a href="link6.html">tte item</a></li>
                 <li>    <!-- 此处 li 标签未封闭 。但是它也是不影响结果的-->
             </ul>
         </div>
     </body>
     </html>
         '''

         # 1. Turn the string into an element tree.
         data = etree.HTML(html_str)

         # 2. Query it with XPath; the matches always come back as a list.
         result_obj = data.xpath("/html//ul/li")
         print(result_obj)

         # Text wrapped by a tag: the content of the first <a>, i.e. ['first item'].
         content = data.xpath("//a[@href='link1.html']/text()")
         print(content)

         # Value of an attribute.
         value = data.xpath("//li[3]/@class")
         print(value)

         # Fuzzy match with contains(): every <li> whose class attribute contains "it".
         ret = data.xpath("//li[contains(@class,'it')]")
         print(ret)

         # Extra: messy input can be serialized back to a string, which completes
         # the tags that were left open:
         #     result_formated = etree.tostring(data).decode("utf8")
         #     print(result_formated)
    51 
    52 
    53 
    54 
    55 
    56 
    57 
    58 
    59 
    60 if __name__ == '__main__':
    61     test()
    View Code

    xpath  和 lxml 的 代码使用步骤:

     1 import requests
     2 import random
     3 
     4 USER_AGENT = [
     5     "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
     6     # IPhone
     7     "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
     8     # IPod
     9     "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    10     # IPAD
    11     "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
    12     "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    13     # Android
    14     "Mozilla/5.0 (Linux; U; Android 2.2.1; zh-cn; HTC_Wildfire_A3333 Build/FRG83D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    15     "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    16     # QQ浏览器 Android版本
    17     "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    18     # Android Opera Mobile
    19     "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
    20     # Android Pad Moto Xoom
    21     "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
    22     # BlackBerry
    23     "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
    24     # WebOS HP Touchpad
    25     "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
    26     # Nokia N97
    27     "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
    28     # Windows Phone Mango
    29     "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
    30     # UC浏览器
    31     "UCWEB7.0.2.37/28/999",
    32     "NOKIA5700/ UCWEB7.0.2.37/28/999",
    33     # UCOpenwave
    34     "Openwave/ UCWEB7.0.2.37/28/999",
    35     # UC Opera
    36     "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999"
    37 ]
    38 
    39 #https://search.chinahr.com/bj/job/pn1/?key=python
    class ChinaHRSpider():
        """Download one ChinaHR search-result page and save the raw HTML to disk.

        Example of the final request URL:
        https://search.chinahr.com/bj/job/pn1/?key=python
        """

        def __init__(self):
            self.base_url = "https://search.chinahr.com/bj/job/pn1/"
            # One User-Agent is picked at random when the spider is created.
            self.headers = {"User-Agent": random.choice(USER_AGENT)}

        # 1. Send the request
        def send_requests(self, params, timeout=10):
            """Fetch ``self.base_url`` with the query ``params`` and return the body as text.

            ``timeout`` (seconds) keeps a stalled connection from blocking forever.
            Raises ``requests.HTTPError`` on a 4xx/5xx answer so that an error
            page is never saved as if it were real data.
            """
            response = requests.get(
                self.base_url,
                headers=self.headers,
                params=params,
                timeout=timeout,
            )
            response.raise_for_status()
            return response.content.decode("utf8")

        # 2. Parse the data (to be done with XPath)
        def parase_data(self, data):
            """Placeholder for the parsing step; does nothing and returns ``None``."""
            return None

        # 3. Save the data
        def write_file(self, data):
            """Write ``data`` (a string) to ``ChinaHR.html`` in the working directory."""
            with open("ChinaHR.html", "w", encoding="utf8") as f:
                f.write(data)

        # 4. Orchestrate the steps
        def run(self):
            """Request the page for the keyword "python" and save it."""
            params = {
                "key": "python",
            }
            data_str = self.send_requests(params)
            self.write_file(data_str)
    67 if __name__ == '__main__':
    68     ChinaHRSpider().run()
    中华英才网 爬虫 测试

    下面解析上面拿到的html文档。

     1 import requests
     2 import random
     3 from lxml import etree
     4 
     5 USER_AGENT = [
     6     "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
     7     # IPhone
     8     "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
     9     # IPod
    10     "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    11     # IPAD
    12     "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
    13     "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    14     # Android
    15     "Mozilla/5.0 (Linux; U; Android 2.2.1; zh-cn; HTC_Wildfire_A3333 Build/FRG83D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    16     "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    17     # QQ浏览器 Android版本
    18     "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    19     # Android Opera Mobile
    20     "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
    21     # Android Pad Moto Xoom
    22     "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
    23     # BlackBerry
    24     "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
    25     # WebOS HP Touchpad
    26     "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
    27     # Nokia N97
    28     "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
    29     # Windows Phone Mango
    30     "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
    31     # UC浏览器
    32     "UCWEB7.0.2.37/28/999",
    33     "NOKIA5700/ UCWEB7.0.2.37/28/999",
    34     # UCOpenwave
    35     "Openwave/ UCWEB7.0.2.37/28/999",
    36     # UC Opera
    37     "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999"
    38 ]
    39 
    40 #https://search.chinahr.com/bj/job/pn1/?key=python
    class ChinaHRSpider():
        """Download one ChinaHR search-result page, save it, and extract job titles.

        Example of the final request URL:
        https://search.chinahr.com/bj/job/pn1/?key=python
        """

        def __init__(self):
            self.base_url = "https://search.chinahr.com/bj/job/pn1/"
            # One User-Agent is picked at random when the spider is created.
            self.headers = {"User-Agent": random.choice(USER_AGENT)}

        # 1. Send the request
        def send_requests(self, params, timeout=10):
            """Fetch ``self.base_url`` with the query ``params`` and return the body as text.

            ``timeout`` (seconds) keeps a stalled connection from blocking forever.
            Raises ``requests.HTTPError`` on a 4xx/5xx answer.
            """
            response = requests.get(
                self.base_url,
                headers=self.headers,
                params=params,
                timeout=timeout,
            )
            response.raise_for_status()
            return response.content.decode("utf8")

        # 2. Parse the data with XPath
        def parase_data(self, data_str):
            """Return the job titles found in the HTML text ``data_str`` as a list.

            Blank input yields an empty list. The XPath assumes the result rows
            live under ``div.job-list-box`` -- NOTE(review): confirm against the
            current page markup.
            """
            if not data_str or not data_str.strip():
                return []
            # a. Convert the string into an element tree.
            html_data = etree.HTML(data_str)
            if html_data is None:
                return []
            # b. Run the XPath query; the title attribute holds the job name.
            return html_data.xpath('//div[@class="job-list-box"]//ul[1]/li/@title')

        # 3. Save the data
        def write_file(self, data):
            """Write ``data`` (a string) to ``ChinaHR.html`` in the working directory."""
            with open("ChinaHR.html", "w", encoding="utf8") as f:
                f.write(data)

        # 4. Orchestrate the steps
        def run(self):
            """Request the page for the keyword "python" and save the raw HTML."""
            params = {
                "key": "python",
            }
            data_str = self.send_requests(params)
            self.write_file(data_str)
    def test():
        """Parse the saved ``ChinaHR.html`` offline and return the job titles found in it.

        Reading the saved file lets the XPath expression be tried out without
        sending another request. The titles are printed and also returned.
        """
        with open("ChinaHR.html", "r", encoding="utf8") as f:
            data_str = f.read()

        def parase_data(data_str):
            """Return the ``title`` attribute of the matched ``<li>`` elements."""
            # a. Convert the string into an element tree.
            html_data = etree.HTML(data_str)
            # b. Run the XPath query.
            return html_data.xpath('//div[@class="job-list-box"]//ul[1]/li/@title')

        jobname_list = parase_data(data_str)
        print(jobname_list)
        return jobname_list
    85 
    86 if __name__ == '__main__':
    87     # ChinaHRSpider().run()
    88     test()
    将取出的数据放到列表中,这样不太好,应该放到一个字典中
     1 import requests
     2 import random
     3 from lxml import etree
     4 import json
     5 
     6 USER_AGENT = [
     7     "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
     8     # IPhone
     9     "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    10     # IPod
    11     "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    12     # IPAD
    13     "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
    14     "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    15     # Android
    16     "Mozilla/5.0 (Linux; U; Android 2.2.1; zh-cn; HTC_Wildfire_A3333 Build/FRG83D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    17     "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    18     # QQ浏览器 Android版本
    19     "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    20     # Android Opera Mobile
    21     "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
    22     # Android Pad Moto Xoom
    23     "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
    24     # BlackBerry
    25     "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
    26     # WebOS HP Touchpad
    27     "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
    28     # Nokia N97
    29     "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
    30     # Windows Phone Mango
    31     "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
    32     # UC浏览器
    33     "UCWEB7.0.2.37/28/999",
    34     "NOKIA5700/ UCWEB7.0.2.37/28/999",
    35     # UCOpenwave
    36     "Openwave/ UCWEB7.0.2.37/28/999",
    37     # UC Opera
    38     "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999"
    39 ]
    40 #https://search.chinahr.com/bj/job/pn1/?key=python
    class ChinaHRSpider():
        """Scrape one ChinaHR search-result page and save the job rows as JSON.

        Each job becomes a dict; the dicts are collected in a list and the list
        is dumped to ``ChinaHR.json``.
        """

        def __init__(self):
            self.base_url = "https://search.chinahr.com/bj/job/pn1/"
            # One User-Agent is picked at random when the spider is created.
            self.headers = {"User-Agent": random.choice(USER_AGENT)}

        # 1. Send the request
        def send_requests(self, params, timeout=10):
            """Fetch ``self.base_url`` with the query ``params`` and return the body as text.

            ``timeout`` (seconds) keeps a stalled connection from blocking forever.
            Raises ``requests.HTTPError`` on a 4xx/5xx answer.
            """
            response = requests.get(
                self.base_url,
                headers=self.headers,
                params=params,
                timeout=timeout,
            )
            response.raise_for_status()
            return response.content.decode("utf8")

        @staticmethod
        def _first_text(node, path):
            """Return the first match of ``path`` under ``node``, stripped; "" if no match."""
            matches = node.xpath(path)
            return str(matches[0]).strip() if matches else ""

        @staticmethod
        def _split_address(raw):
            """Split "address|experience|education" into exactly three stripped fields.

            Missing trailing fields come back as empty strings instead of
            raising ``IndexError``; anything after the third field is ignored.
            """
            parts = [part.strip() for part in raw.split("|")]
            parts += [""] * (3 - len(parts))
            return tuple(parts[:3])

        # 2. Parse the data
        def parase_data(self, data_str):  # --> list
            """Extract one dict per job row from the HTML text ``data_str``.

            Returns a list of dicts with the keys ``job_name``, ``fabu_date``,
            ``job_address``, ``jingyan``, ``xueli`` and ``company_name``.
            Blank input yields an empty list; rows without a job name are skipped.
            """
            if not data_str or not data_str.strip():
                return []
            # a. Convert the string into an element tree.
            html_data = etree.HTML(data_str)
            if html_data is None:
                return []
            # b. Each child div of the list box holds the fields of one row.
            divs = html_data.xpath('//div[@class="job-list-box"]/div')
            job_list = []
            for div in divs:
                job_name = self._first_text(div, 'ul/li[@class="job-name"]/@title')
                if not job_name:
                    continue
                # The job-address cell packs three fields separated by "|";
                # query it once and split.
                job_address, jingyan, xueli = self._split_address(
                    self._first_text(div, 'ul/li[@class="job-address"]/text()')
                )
                job_list.append({
                    "job_name": job_name,
                    "fabu_date": self._first_text(div, 'ul/li[@class="fabu-date"]/text()'),
                    "job_address": job_address,
                    "jingyan": jingyan,
                    "xueli": xueli,
                    "company_name": self._first_text(div, 'ul/li[@class="job-company"]/text()'),
                })
            return job_list

        # 3. Save the data
        def write_file(self, data):
            """Dump ``data`` to ``ChinaHR.json``; non-ASCII text is written unescaped."""
            with open("ChinaHR.json", "w", encoding="utf8") as f:
                json.dump(data, f, ensure_ascii=False)

        # 4. Orchestrate the steps
        def run(self):
            """Request the page for the keyword "python", parse it and save the rows."""
            params = {
                "key": "python",
            }
            data_str = self.send_requests(params)
            ret = self.parase_data(data_str)
            self.write_file(ret)
    90 
    91 if __name__ == '__main__':
    92     ChinaHRSpider().run()
    将取出的 数据 放到一个字典中, 最后将字典放到列表中,最后保存成Json文件!
     1 import requests
     2 import random
     3 from lxml import etree
     4 import json
     5 
     6 USER_AGENT = [
     7     "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
     8     # IPhone
     9     "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    10     # IPod
    11     "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    12     # IPAD
    13     "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
    14     "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    15     # Android
    16     "Mozilla/5.0 (Linux; U; Android 2.2.1; zh-cn; HTC_Wildfire_A3333 Build/FRG83D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    17     "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    18     # QQ浏览器 Android版本
    19     "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    20     # Android Opera Mobile
    21     "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
    22     # Android Pad Moto Xoom
    23     "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
    24     # BlackBerry
    25     "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
    26     # WebOS HP Touchpad
    27     "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
    28     # Nokia N97
    29     "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
    30     # Windows Phone Mango
    31     "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
    32     # UC浏览器
    33     "UCWEB7.0.2.37/28/999",
    34     "NOKIA5700/ UCWEB7.0.2.37/28/999",
    35     # UCOpenwave
    36     "Openwave/ UCWEB7.0.2.37/28/999",
    37     # UC Opera
    38     "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999"
    39 ]
    40 #https://search.chinahr.com/bj/job/pn9/?key=python
    class ChinaHRSpider():
        """Scrape several ChinaHR search-result pages and save all job rows as JSON.

        Page URLs are built as ``base_url + <page number> + "/"``; rows from
        every page accumulate in ``self.job_list``.
        """

        def __init__(self):
            self.base_url = "https://search.chinahr.com/bj/job/pn"
            # One User-Agent is picked at random when the spider is created.
            self.headers = {"User-Agent": random.choice(USER_AGENT)}
            # Rows collected so far, across all pages.
            self.job_list = []

        # 1. Send the request
        def send_requests(self, new_url, params, timeout=10):
            """Fetch ``new_url`` with the query ``params`` and return the body as text.

            ``timeout`` (seconds) keeps a stalled connection from blocking forever.
            Raises ``requests.HTTPError`` on a 4xx/5xx answer.
            """
            response = requests.get(
                new_url,
                headers=self.headers,
                params=params,
                timeout=timeout,
            )
            response.raise_for_status()
            return response.content.decode("utf8")

        @staticmethod
        def _first_text(node, path):
            """Return the first match of ``path`` under ``node``, stripped; "" if no match."""
            matches = node.xpath(path)
            return str(matches[0]).strip() if matches else ""

        @staticmethod
        def _split_address(raw):
            """Split "address|experience|education" into exactly three stripped fields.

            Missing trailing fields come back as empty strings instead of
            raising ``IndexError``; anything after the third field is ignored.
            """
            parts = [part.strip() for part in raw.split("|")]
            parts += [""] * (3 - len(parts))
            return tuple(parts[:3])

        # 2. Parse the data
        def parase_data(self, data_str):
            """Append one dict per job row found in ``data_str`` to ``self.job_list``.

            Each dict has the keys ``job_name``, ``fabu_date``, ``job_address``,
            ``jingyan``, ``xueli`` and ``company_name``. Blank input adds
            nothing; rows without a job name are skipped.
            """
            if not data_str or not data_str.strip():
                return
            # a. Convert the string into an element tree.
            html_data = etree.HTML(data_str)
            if html_data is None:
                return
            # b. Each child div of the list box holds the fields of one row.
            divs = html_data.xpath('//div[@class="job-list-box"]/div')
            for div in divs:
                job_name = self._first_text(div, 'ul/li[@class="job-name"]/@title')
                if not job_name:
                    continue
                # The job-address cell packs three fields separated by "|";
                # query it once and split.
                job_address, jingyan, xueli = self._split_address(
                    self._first_text(div, 'ul/li[@class="job-address"]/text()')
                )
                self.job_list.append({
                    "job_name": job_name,
                    "fabu_date": self._first_text(div, 'ul/li[@class="fabu-date"]/text()'),
                    "job_address": job_address,
                    "jingyan": jingyan,
                    "xueli": xueli,
                    "company_name": self._first_text(div, 'ul/li[@class="job-company"]/text()'),
                })

        # 3. Save the data
        def write_file(self):
            """Dump ``self.job_list`` to ``ChinaHR.json``; non-ASCII text is written unescaped."""
            with open("ChinaHR.json", "w", encoding="utf8") as f:
                json.dump(self.job_list, f, ensure_ascii=False)

        # 4. Orchestrate the steps
        def run(self, first_page=1, last_page=9):
            """Scrape pages ``first_page``..``last_page`` (inclusive; default 1-9).

            The JSON file is rewritten after every page, so the rows gathered so
            far are already on disk if a later request fails.
            """
            params = {
                "key": "python",
            }
            for pageNum in range(first_page, last_page + 1):
                # Announce the page before fetching it, so the message matches
                # what is happening.
                print("正在下载第{}页......".format(pageNum))
                new_url = self.base_url + str(pageNum) + "/"
                data_str = self.send_requests(new_url, params)
                self.parase_data(data_str)
                self.write_file()
    95 if __name__ == '__main__':
    96     ChinaHRSpider().run()
    爬取多页之第一种:固定循环次数!

    bs4 

  • 相关阅读:
    机器学习(深度学习)
    机器学习(六)
    机器学习一-三
    Leetcode 90. 子集 II dfs
    Leetcode 83. 删除排序链表中的重复元素 链表操作
    《算法竞赛进阶指南》 第二章 Acwing 139. 回文子串的最大长度
    LeetCode 80. 删除有序数组中的重复项 II 双指针
    LeetCode 86 分割链表
    《算法竞赛进阶指南》 第二章 Acwing 138. 兔子与兔子 哈希
    《算法竞赛进阶指南》 第二章 Acwing 137. 雪花雪花雪花 哈希
  • 原文地址:https://www.cnblogs.com/zach0812/p/12005571.html
Copyright © 2011-2022 走看看