pyquery标签选择
获取了所有的img标签(css选择器,你也可以换成不同的class和id)
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
import re

import requests
from pyquery import PyQuery as pq

# Browser-like headers sent along with the request.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36"
}

# Download the page with requests, then hand the raw bytes to pyquery.
response = requests.get(
    'https://www.zhihu.com/question/35239964/answer/66644148',
    headers=headers,
    timeout=9,
)
doc = pq(response.content)

# CSS selector: select every <img> element in the document.
a = doc('img')  # <class 'pyquery.pyquery.PyQuery'>
print(a)
url初始化(通过访问url得到html代码)
有了pyquery,你甚至不需要再使用requests来get网页
from pyquery import PyQuery as pq

# Browser-like headers sent along with the request.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36"
}

# Initialize directly from a URL: pyquery fetches the page source itself.
doc = pq(url='https://www.zhihu.com/question/35239964/answer/66644148', headers=headers)
print(doc('title').text())
文件初始化(通过文件得到html代码)
# Initialize from a file
from pyquery import PyQuery as pq
import requests

# Write the file (kept commented out, as in the original tutorial)
# headers={
#     "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
#     "Accept-Encoding": "gzip, deflate",
#     "Accept-Language": "zh-CN,zh;q=0.9",
#     "Upgrade-Insecure-Requests": "1",
#     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36"
# }
# result=requests.get('https://www.zhihu.com/question/35239964/answer/66644148',headers=headers)
# # result=pq(url='https://www.zhihu.com/question/35239964/answer/66644148',headers=headers)
# print(type(result))
# with open("zhihu.html",'wb')as f:
#     f.write(result.content)

# Read the file
# NOTE(review): the commented-out writer saves "zhihu.html" while this reads
# 'test.html' -- confirm which file name is intended.
doc = pq(filename='test.html')
print(doc('title'))
css选择器
doc('#content .list li')  # returns every li that matches this nesting hierarchy
查找元素
子元素
find(查找内部所有后代元素)
from pyquery import PyQuery as pq

html = '.....'
doc = pq(html)
a = doc('.content')
b = a.find('img')  # find the img tags inside .content
children(只查找直接子元素;find 会查找所有后代元素)
from pyquery import PyQuery as pq

html = '.....'
doc = pq(html)
a = doc('.content')
b = a.children('img')  # img tags that are direct children of .content
父元素
from pyquery import PyQuery as pq

html = '.....'
doc = pq(html)
a = doc('.content')
b = a.parent()  # the parent element of .content, as a whole
from pyquery import PyQuery as pq

html = '.....'
doc = pq(html)
a = doc('.content')
b = a.parents()  # all ancestor elements of .content
当然也可以加上css选择器
from pyquery import PyQuery as pq

html = '.....'
doc = pq(html)
a = doc('.content')
b = a.parents('.wrap')  # ancestors of .content that match the .wrap selector
兄弟元素
from pyquery import PyQuery as pq

html = '.....'
doc = pq(html)
a = doc('.content')
b = a.siblings()  # sibling elements of .content; a CSS selector may also be passed
遍历查找的元素
from pyquery import PyQuery as pq

# Browser-like headers sent along with the request.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36"
}

doc = pq(url='https://www.zhihu.com/question/35239964/answer/66644148', headers=headers)
a = doc('img').items()  # iterable that yields one PyQuery object per matched element
for i in a:
    print(i)
<img src="data:image/svg+xml;utf8,<svg%20xmlns='http://www.w3.org/2000/svg'%20width='503'%20height='410'></svg>" data-rawwidth="503" data-rawheight="410" class="origin_image zh-lightbox-thumb lazy" width="503" data-original="https://pic4.zhimg.com/d8d9743a16e6db3f36b19a3397469b1d_r.jpg" data-actualsrc="https://pic4.zhimg.com/50/d8d9743a16e6db3f36b19a3397469b1d_hd.jpg"/>
假如 i 是上面得到的 html 元素
获取属性:jpgurl=i.attr('data-original')
获取文本:text=i.text()
获取HTML:html=i.html()(获取 i 内部的 html 内容)
DOM操作
add_class(添加class)、remove_class(移除class),也可以写作 addClass、removeClass
attr: .attr('name', 'userid')(添加或替换name属性)
css: .css('height', '500px')(添加或替换style)
remove:
from pyquery import PyQuery as pq

# Sample markup: a .wrap div holding some text and one <p> child.
http = ''' <div class='wrap'> helloworld <p>this is p</p> </div>'''

doc = pq(http)
wrap = doc('.wrap')
print(wrap.text())  # helloworld this is p

# remove() drops the matched <p> from the tree, so its text disappears too.
wrap.find('p').remove()
print(wrap.text())  # helloworld