zoukankan      html  css  js  c++  java
  • 爬虫 cast day04

    xpath 

    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <title>xpath的基本语法</title>
    </head>
    <body>
        <script>
            /*
            *     xpath 基本语法:
            *       1, 根节点 : /  eg: /html/body/note/book
            *       2, 跨节点 : // eg: //book
            *       3, 使用下标: [],下标从1开始,并且下标是相对于同一个父节点下的同名兄弟元素来计数的。 eg: //book[1] 取到第一个 book;而 //title[1] 取到的是每个父节点下的第一个 title(会匹配到多个),如果想要整个文档中的第一个 title,要写成 (//title)[1]
            *                       下标如果倒着来的话,要用last()  eg:  //book[last()] 倒数第一个   book[last()-1] 倒数第二个,以此类推
            *                        也可以这样:  book[position()>1]
            *       4, 精确查找: 属性选择器 eg:  //title[@name="zcb1"]
            *       上面四个都是获取的是标签。
            *
            *       下面是获取具体的值
            *       5, 标签包裹的 内容 :使用text().  -->  string  eg://book[2]/title/text()   这得到的才是字符串
            *       6, 取标签中属性的 value  : 使用@属性名 -->string  eg : //book[1]/title/@name
            * */
        </script>
        <note>
            <book>
                <title name="zcb1">哈利波特1</title>
                <price>100</price>
            </book>
            <book>
                <title>哈利波特2</title>
                <price>101</price>
            </book>
            <book>
                <title>哈利波特3</title>
                <price>102</price>
            </book>
        </note>
    </body>
    </html>
    xpath 的基本语法

    xpath 还有模糊查询,例如 contains():  //li[contains(@class,'it')]  会选中 class 属性中包含 "it" 的所有 li 标签。

    xpath 练习:

    https://movie.douban.com/top250

    //ol[@class="grid_view"]/li//span[@class="title"][1]/text()
    获取里面的电影的名字
    //ol/li//div[@class="hd"]/a/@href
    获取链接
    //ol/li//div[@class="star"]/span[last()]
    评价人数

    https://www.baidu.com

    //a[text()='下一页>']/@href
    根据文本内容选择 标签,实现翻页功能。

    lxml:

    它也是一个第三方库:

     1 from lxml  import etree
     2 
     def test():
         """Demonstrate basic lxml XPath queries against an inline HTML snippet.

         The snippet is parsed with ``etree.HTML`` and four query results are
         printed in order: the ``li`` elements under ``/html//ul``, the text of
         the link whose ``href`` is ``link1.html``, the ``class`` attribute of
         ``li[3]``, and the ``li`` elements whose ``class`` contains ``it``.
         """
         html_str = '''
     <!DOCTYPE html>
     <html lang="en">
     <head>
         <meta charset="UTF-8">
         <title>Title</title>
     </head>
     <body>
         <div>
             <ul>
                 <li class="item1"><a href="link1.html">first item</a></li>
                 <li class="item2"><a href="link2.html">ist item</a></li>
                 <li class="item3"><a href="link3.html">f item</a></li>
                 <li class="item4"><a href="link4.html">st item</a></li>
                 <li class="item5"><a href="link5.html">fi item</a></li>
                 <li class="item6"><a href="link6.html">tte item</a></li>
                 <li>    <!-- 此处 li 标签未封闭 。但是它也是不影响结果的--> 
             </ul>
         </div>
     </body>
     </html>
         '''

         # Step 1: turn the raw string into an element tree.
         data = etree.HTML(html_str)

         # Step 2: run each query and print what xpath() hands back.
         queries = (
             "/html//ul/li",                    # the li elements themselves
             "//a[@href='link1.html']/text()",  # text wrapped by the first link
             "//li[3]/@class",                  # value of an attribute
             "//li[contains(@class,'it')]",     # fuzzy match on the class attribute
         )
         for query in queries:
             print(data.xpath(query))

         # Tip from the original notes: etree.tostring(data).decode("utf8")
         # serialises the parsed tree back to text, which is handy for
         # inspecting messy input after the parser has repaired it.


     if __name__ == '__main__':
         test()
    View Code

    xpath  和 lxml 的 代码使用步骤:

     1 import requests
     2 import random
     3 
     4 USER_AGENT = [
     5     "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
     6     # IPhone
     7     "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
     8     # IPod
     9     "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    10     # IPAD
    11     "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
    12     "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    13     # Android
    14     "Mozilla/5.0 (Linux; U; Android 2.2.1; zh-cn; HTC_Wildfire_A3333 Build/FRG83D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    15     "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    16     # QQ浏览器 Android版本
    17     "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    18     # Android Opera Mobile
    19     "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
    20     # Android Pad Moto Xoom
    21     "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
    22     # BlackBerry
    23     "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
    24     # WebOS HP Touchpad
    25     "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
    26     # Nokia N97
    27     "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
    28     # Windows Phone Mango
    29     "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
    30     # UC浏览器
    31     "UCWEB7.0.2.37/28/999",
    32     "NOKIA5700/ UCWEB7.0.2.37/28/999",
    33     # UCOpenwave
    34     "Openwave/ UCWEB7.0.2.37/28/999",
    35     # UC Opera
    36     "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999"
    37 ]
    38 
    39 #https://search.chinahr.com/bj/job/pn1/?key=python
    class ChinaHRSpider():
        """Download one ChinaHR search-results page and save the raw HTML.

        The parsing step is still a stub in this version: ``run`` only fetches
        the page and writes it to ``ChinaHR.html``.
        """

        def __init__(self):
            self.base_url = "https://search.chinahr.com/bj/job/pn1/"
            # One User-Agent is drawn at random when the spider is created.
            self.headers = {"User-Agent": random.choice(USER_AGENT)}

        # 1. send the request
        def send_requests(self, params):
            """GET ``base_url`` with *params* and return the body decoded as UTF-8."""
            raw_body = requests.get(
                self.base_url, headers=self.headers, params=params
            ).content
            return raw_body.decode("utf8")

        # 2. parse the data (to be done with XPath)
        def parase_data(self, data):
            """Placeholder for the parsing step; currently does nothing."""
            return None

        # 3. save the data
        def write_file(self, data):
            """Write *data* (a str) to ``ChinaHR.html`` in the working directory."""
            with open("ChinaHR.html", "w", encoding="utf8") as out:
                out.write(data)

        # 4. orchestration
        def run(self):
            """Fetch the page for the keyword ``python`` and save it."""
            page = self.send_requests({"key": "python"})
            self.write_file(page)


    if __name__ == '__main__':
        ChinaHRSpider().run()
    中华英才网 爬虫 测试

    下面解析上面拿到的html文档。

     1 import requests
     2 import random
     3 from lxml import etree
     4 
     5 USER_AGENT = [
     6     "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
     7     # IPhone
     8     "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
     9     # IPod
    10     "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    11     # IPAD
    12     "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
    13     "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    14     # Android
    15     "Mozilla/5.0 (Linux; U; Android 2.2.1; zh-cn; HTC_Wildfire_A3333 Build/FRG83D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    16     "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    17     # QQ浏览器 Android版本
    18     "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    19     # Android Opera Mobile
    20     "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
    21     # Android Pad Moto Xoom
    22     "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
    23     # BlackBerry
    24     "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
    25     # WebOS HP Touchpad
    26     "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
    27     # Nokia N97
    28     "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
    29     # Windows Phone Mango
    30     "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
    31     # UC浏览器
    32     "UCWEB7.0.2.37/28/999",
    33     "NOKIA5700/ UCWEB7.0.2.37/28/999",
    34     # UCOpenwave
    35     "Openwave/ UCWEB7.0.2.37/28/999",
    36     # UC Opera
    37     "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999"
    38 ]
    39 
    40 #https://search.chinahr.com/bj/job/pn1/?key=python
    class ChinaHRSpider():
        """Download one ChinaHR search-results page and save the raw HTML.

        ``parase_data`` only builds the lxml tree so far; no XPath query is
        run on it yet, and ``run`` does not call it.
        """

        def __init__(self):
            self.base_url = "https://search.chinahr.com/bj/job/pn1/"
            # One User-Agent is drawn at random when the spider is created.
            self.headers = {"User-Agent": random.choice(USER_AGENT)}

        # 1. send the request
        def send_requests(self, params):
            """GET ``base_url`` with *params* and return the body decoded as UTF-8."""
            reply = requests.get(self.base_url, headers=self.headers, params=params)
            body_bytes = reply.content
            return body_bytes.decode("utf8")

        # 2. parse the data with XPath
        def parase_data(self, data_str):
            """Convert *data_str* into an lxml tree; the queries are still to be added."""
            # a. convert the string into an element tree
            html_data = etree.HTML(data_str)
            # b. the xpath() calls will go here

        # 3. save the data
        def write_file(self, data):
            """Write *data* (a str) to ``ChinaHR.html`` in the working directory."""
            with open("ChinaHR.html", "w", encoding="utf8") as out:
                out.write(data)

        # 4. orchestration
        def run(self):
            """Fetch the page for the keyword ``python`` and save it."""
            query = {"key": "python"}
            self.write_file(self.send_requests(query))
    def test():
        """Parse the saved ``ChinaHR.html`` offline and show the job titles found.

        The file must already exist in the working directory (it is produced
        by ``ChinaHRSpider().run()``). Working from the saved copy lets the
        XPath expression be tuned without hitting the site again.

        Returns:
            The list produced by the XPath query: the ``title`` attribute of
            each matching ``li``.

        Raises:
            FileNotFoundError: if ``ChinaHR.html`` has not been downloaded yet.
        """
        with open("ChinaHR.html", "r", encoding="utf8") as f:
            data_str = f.read()

        def parase_data(data_str):
            """Return the ``title`` attributes selected from *data_str*."""
            # a. convert the string into an element tree
            html_data = etree.HTML(data_str)
            # b. run the XPath query
            return html_data.xpath('//div[@class="job-list-box"]//ul[1]/li/@title')

        # The original computed this list and then dropped it, so running the
        # function showed nothing; print and return it instead.
        jobname_list = parase_data(data_str)
        print(jobname_list)
        return jobname_list


    if __name__ == '__main__':
        # ChinaHRSpider().run()  # run this once first to create ChinaHR.html
        test()
    将取出的数据放到列表中,这样不太好,应该放到一个字典中
     1 import requests
     2 import random
     3 from lxml import etree
     4 import json
     5 
     6 USER_AGENT = [
     7     "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
     8     # IPhone
     9     "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    10     # IPod
    11     "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    12     # IPAD
    13     "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
    14     "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    15     # Android
    16     "Mozilla/5.0 (Linux; U; Android 2.2.1; zh-cn; HTC_Wildfire_A3333 Build/FRG83D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    17     "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    18     # QQ浏览器 Android版本
    19     "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    20     # Android Opera Mobile
    21     "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
    22     # Android Pad Moto Xoom
    23     "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
    24     # BlackBerry
    25     "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
    26     # WebOS HP Touchpad
    27     "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
    28     # Nokia N97
    29     "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
    30     # Windows Phone Mango
    31     "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
    32     # UC浏览器
    33     "UCWEB7.0.2.37/28/999",
    34     "NOKIA5700/ UCWEB7.0.2.37/28/999",
    35     # UCOpenwave
    36     "Openwave/ UCWEB7.0.2.37/28/999",
    37     # UC Opera
    38     "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999"
    39 ]
    40 #https://search.chinahr.com/bj/job/pn1/?key=python
    class ChinaHRSpider():
        """Scrape one ChinaHR search-results page into a JSON file of job records.

        Each record is a dict with the keys ``job_name``, ``fabu_date``,
        ``job_address``, ``jingyan``, ``xueli`` and ``company_name``.
        """

        def __init__(self):
            self.base_url = "https://search.chinahr.com/bj/job/pn1/"
            # One User-Agent is drawn at random when the spider is created.
            self.headers = {"User-Agent": random.choice(USER_AGENT)}

        # 1. send the request
        def send_requests(self, params):
            """Fetch ``base_url`` and return the response body decoded as UTF-8.

            Args:
                params: Query-string parameters handed to ``requests.get``.

            Raises:
                requests.exceptions.RequestException: on network failure or
                    when the timeout below expires.
            """
            # Without a timeout a stalled connection would block forever.
            response = requests.get(
                self.base_url, headers=self.headers, params=params, timeout=10
            )
            return response.content.decode("utf8")

        @staticmethod
        def _first(node, query):
            """Return the first XPath hit of *query* on *node* as str, or "" if none."""
            hits = node.xpath(query)
            return str(hits[0]) if hits else ""

        @classmethod
        def _parse_row(cls, div):
            """Build one job record from a result-row element.

            Returns ``None`` when the row has no ``job-name`` item, so such
            rows are skipped instead of raising ``IndexError``.
            """
            job_name = cls._first(div, 'ul/li[@class="job-name"]/@title')
            if not job_name:
                return None
            # One text node carries "address|experience|education"; query it
            # once and pad to three fields so a short row cannot raise.
            info = cls._first(div, 'ul/li[@class="job-address"]/text()').strip()
            fields = info.split('|')
            fields += [""] * (3 - len(fields))
            return {
                "job_name": job_name,
                "fabu_date": cls._first(div, 'ul/li[@class="fabu-date"]/text()').strip(),
                "job_address": fields[0],
                "jingyan": fields[1],
                "xueli": fields[2],
                "company_name": cls._first(div, 'ul/li[@class="job-company"]/text()').strip(),
            }

        # 2. parse the data
        def parase_data(self, data_str):   # --> list
            """Extract the job records from the page source.

            Args:
                data_str: HTML text of a search-results page.

            Returns:
                A list of record dicts, one per row that has a job name.
            """
            # a. convert the string into an element tree
            html_data = etree.HTML(data_str)
            # b. each child div of the list box holds the fields of one row
            divs = html_data.xpath('//div[@class="job-list-box"]/div')
            rows = (self._parse_row(div) for div in divs)
            return [row for row in rows if row is not None]

        # 3. save the data
        def write_file(self, data):
            """Dump *data* as JSON to ``ChinaHR.json``, keeping non-ASCII text readable."""
            with open("ChinaHR.json", "w", encoding="utf8") as f:
                json.dump(data, f, ensure_ascii=False)

        # 4. orchestration
        def run(self):
            """Fetch the page for the keyword ``python``, parse it and save the records."""
            params = {
                "key": "python"
            }
            data_str = self.send_requests(params)
            ret = self.parase_data(data_str)
            self.write_file(ret)


    if __name__ == '__main__':
        ChinaHRSpider().run()
    将取出的 数据 放到一个字典中, 最后将字典放到列表中,最后保存成Json文件!
     1 import requests
     2 import random
     3 from lxml import etree
     4 import json
     5 
     6 USER_AGENT = [
     7     "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
     8     # IPhone
     9     "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    10     # IPod
    11     "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    12     # IPAD
    13     "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
    14     "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    15     # Android
    16     "Mozilla/5.0 (Linux; U; Android 2.2.1; zh-cn; HTC_Wildfire_A3333 Build/FRG83D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    17     "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    18     # QQ浏览器 Android版本
    19     "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    20     # Android Opera Mobile
    21     "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
    22     # Android Pad Moto Xoom
    23     "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
    24     # BlackBerry
    25     "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
    26     # WebOS HP Touchpad
    27     "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
    28     # Nokia N97
    29     "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
    30     # Windows Phone Mango
    31     "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
    32     # UC浏览器
    33     "UCWEB7.0.2.37/28/999",
    34     "NOKIA5700/ UCWEB7.0.2.37/28/999",
    35     # UCOpenwave
    36     "Openwave/ UCWEB7.0.2.37/28/999",
    37     # UC Opera
    38     "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999"
    39 ]
    40 #https://search.chinahr.com/bj/job/pn9/?key=python
    class ChinaHRSpider():
        """Scrape ChinaHR search-results pages 1-9 into one JSON file.

        Records from every page accumulate in ``self.job_list``. Each record
        is a dict with the keys ``job_name``, ``fabu_date``, ``job_address``,
        ``jingyan``, ``xueli`` and ``company_name``.
        """

        def __init__(self):
            # The page number and a trailing "/" are appended per request.
            self.base_url = "https://search.chinahr.com/bj/job/pn"
            # One User-Agent is drawn at random when the spider is created.
            self.headers = {"User-Agent": random.choice(USER_AGENT)}
            self.job_list = []

        # 1. send the request
        def send_requests(self, new_url, params):
            """Fetch *new_url* and return the response body decoded as UTF-8.

            Args:
                new_url: Full URL of the results page to download.
                params: Query-string parameters handed to ``requests.get``.

            Raises:
                requests.exceptions.RequestException: on network failure or
                    when the timeout below expires.
            """
            # Without a timeout one stalled page would hang the whole crawl.
            response = requests.get(
                new_url, headers=self.headers, params=params, timeout=10
            )
            return response.content.decode("utf8")

        @staticmethod
        def _first(node, query):
            """Return the first XPath hit of *query* on *node* as str, or "" if none."""
            hits = node.xpath(query)
            return str(hits[0]) if hits else ""

        @classmethod
        def _parse_row(cls, div):
            """Build one job record from a result-row element.

            Returns ``None`` when the row has no ``job-name`` item, so such
            rows are skipped instead of raising ``IndexError``.
            """
            job_name = cls._first(div, 'ul/li[@class="job-name"]/@title')
            if not job_name:
                return None
            # One text node carries "address|experience|education"; query it
            # once and pad to three fields so a short row cannot raise.
            info = cls._first(div, 'ul/li[@class="job-address"]/text()').strip()
            fields = info.split('|')
            fields += [""] * (3 - len(fields))
            return {
                "job_name": job_name,
                "fabu_date": cls._first(div, 'ul/li[@class="fabu-date"]/text()').strip(),
                "job_address": fields[0],
                "jingyan": fields[1],
                "xueli": fields[2],
                "company_name": cls._first(div, 'ul/li[@class="job-company"]/text()').strip(),
            }

        # 2. parse the data
        def parase_data(self, data_str):
            """Append the job records found in *data_str* to ``self.job_list``."""
            # a. convert the string into an element tree
            html_data = etree.HTML(data_str)
            # b. each child div of the list box holds the fields of one row
            divs = html_data.xpath('//div[@class="job-list-box"]/div')
            rows = (self._parse_row(div) for div in divs)
            self.job_list.extend(row for row in rows if row is not None)

        # 3. save the data
        def write_file(self):
            """Dump ``self.job_list`` as JSON to ``ChinaHR.json``."""
            with open("ChinaHR.json", "w", encoding="utf8") as f:
                json.dump(self.job_list, f, ensure_ascii=False)

        # 4. orchestration
        def run(self):
            """Download, parse and save pages 1 through 9 for the keyword ``python``."""
            params = {
                "key": "python"
            }
            for pageNum in range(1, 10):  # pages 1-9
                # Announce the page before fetching it, so the message is
                # accurate while the request is in flight.
                print("正在下载第{}页......".format(pageNum))
                new_url = self.base_url + str(pageNum) + "/"
                data_str = self.send_requests(new_url, params)
                self.parase_data(data_str)
                # Rewrite the file after every page so that a failure on a
                # later page still leaves the earlier pages on disk.
                self.write_file()


    if __name__ == '__main__':
        ChinaHRSpider().run()
    爬取多页之第一种:固定循环次数!

    bs4 

  • 相关阅读:
    怎样装两个MySQL服务器
    MySQL 8.0.12的安装与卸载
    位运算符2
    位运算符
    赋值运算符
    love心形
    变量之间运算
    变量
    标识符
    算术运算符
  • 原文地址:https://www.cnblogs.com/zach0812/p/12005571.html
Copyright © 2011-2022 走看看