目标：豆瓣读书（https://book.douban.com/），
下载页面上的书籍封面图片。
# Download the book cover images shown on the Douban Books front page.
import os
import re  # regular expressions
import urllib.request

# Page that is crawled and the directory the covers are saved into.
# Forward slashes are used on purpose: in a normal (non-raw) string literal
# "\U", "\7", "\r" and "\b" are escape sequences, so a back-slashed Windows
# path is either a SyntaxError or a silently corrupted path.
URL = 'https://book.douban.com/'
SAVE_DIR = 'C:/Users/74172/source/repos/Python/spidertest1/images/book.douban'

# One match per <img> tag, in three groups: the src attribute (an http(s) URL
# ending in .jpg), the text between the two attributes, and the alt attribute.
# "[^>]" keeps a match inside a single tag while still spanning line breaks.
_IMG_RE = re.compile(r'(img src="http[^"]*\.jpg")([^>]*?)(alt="[^"]+")')

# Characters that are not allowed in Windows file names.
_ILLEGAL_CHARS_RE = re.compile(r'[\\/:*?"<>|]')


def _safeFileName(title):
    """Return *title* with characters illegal in file names replaced by '_'."""
    return _ILLEGAL_CHARS_RE.sub('_', title)


def getJpg(date):
    """Find the cover images in an HTML page.

    Args:
        date: HTML source of the page, as text.

    Returns:
        A list with one 3-tuple of strings per matching ``<img>`` tag:
        ``('img src="<url>"', '<text between the attributes>',
        'alt="<title>"')``. Only tags whose ``src`` is an http(s) URL ending
        in ``.jpg`` and that carry a non-empty ``alt`` after it are returned.
    """
    return _IMG_RE.findall(date)


def downLoad(jpgUrl, sTitle, n, saveDir=SAVE_DIR):
    """Download one image to ``<saveDir>/<sTitle>.jpg``.

    Failures are printed and swallowed so that one bad image does not stop
    the whole crawl; the completion line is printed in either case.

    Args:
        jpgUrl: URL of the image.
        sTitle: Book title, used as the file name (sanitised first).
        n: Running number of the image, used only in the progress message.
        saveDir: Target directory; it must already exist.
    """
    target = os.path.join(saveDir, '%s.jpg' % _safeFileName(sTitle))
    try:
        urllib.request.urlretrieve(jpgUrl, target)
    except Exception as e:
        # Deliberate best effort: report the problem and continue.
        print(e)
    finally:
        print('图片%s下载操作完成' % n)


def getTitle(date):
    """Return every ``title="X">`` fragment found in *date*.

    NOTE(review): the pattern only matches titles that are exactly one
    character long, which is probably not what was intended. The script
    itself takes titles from the ``alt`` attribute and does not call this
    function, so the pattern is kept unchanged - confirm before relying on it.
    """
    titleList = re.findall(r'title=".">', date)
    return titleList


def main():
    """Fetch the page, then download every cover image found on it."""
    # The context manager closes the HTTP response once the body is read.
    with urllib.request.urlopen(URL) as res:
        date = res.read().decode('utf-8')
    for n, (srcAttr, _between, altAttr) in enumerate(getJpg(date), start=1):
        # Strip the attribute wrappers instead of re-parsing repr() output.
        jpgUrl = srcAttr[len('img src="'):-1]
        sTitle = altAttr[len('alt="'):-1]
        print(n, '--- url -->', jpgUrl)
        downLoad(jpgUrl, sTitle, n)


if __name__ == '__main__':
    main()
又做了一些修改，并将书名写入 txt 文件中：
# Download the book cover images shown on the Douban Books front page and
# append the book titles to a text file.
import os
import re  # regular expressions
import urllib.parse
import urllib.request

# Page that is crawled and the directory the covers are saved into.
URL = 'https://book.douban.com/'
SAVE_DIR = 'C:/Users/74172/source/repos/Python/spidertest1/images/book.douban'
# Title list file, named after the host without its last label:
# 'book.douban.com' -> 'book.douban.txt'.
TXT_PATH = urllib.parse.urlsplit(URL).hostname.rsplit('.', 1)[0] + '.txt'

# One match per <img> tag: group 1 is the src URL (http(s), ending in .jpg),
# group 2 the non-empty alt text that follows it inside the same tag.
# "[^>]" keeps a match inside a single tag while still spanning line breaks.
_IMG_RE = re.compile(r'img src="(http[^"]*\.jpg)"[^>]*?alt="([^"]+)"')

# Characters that are not allowed in Windows file names.
_ILLEGAL_CHARS_RE = re.compile(r'[\\/:*?"<>|]')


def _findImages(html):
    """Return a list of ``(url, title)`` pairs, one per matching <img> tag."""
    return _IMG_RE.findall(html)


def _safeFileName(title):
    """Return *title* with characters illegal in file names replaced by '_'."""
    return _ILLEGAL_CHARS_RE.sub('_', title)


def getJpg(html):
    """Return the cover image URLs found in *html*.

    The list has exactly one entry per matching ``<img>`` tag, in page order,
    so it lines up index-for-index with the result of :func:`getTitle`.
    """
    return [jpgUrl for jpgUrl, _title in _findImages(html)]


def getTitle(html):
    """Return the book titles (``alt`` texts) found in *html*.

    Each title is wrapped in double quotes (``'"Some Book"'``), the form
    callers of this function already strip with ``[1:-1]``. The list lines
    up index-for-index with the result of :func:`getJpg`.
    """
    return ['"%s"' % title for _jpgUrl, title in _findImages(html)]


def downLoad(jpgUrl, sTitle, n, saveDir=SAVE_DIR):
    """Download one image to ``<saveDir>/<sTitle>.jpg``.

    Any download error propagates to the caller; the completion line is
    printed whether or not the download succeeded.

    Args:
        jpgUrl: URL of the image.
        sTitle: Book title, used as the file name (sanitised first) and in
            the progress message.
        n: Running number of the image; accepted for compatibility, unused.
        saveDir: Target directory; it must already exist.
    """
    target = os.path.join(saveDir, '%s.jpg' % _safeFileName(sTitle))
    try:
        urllib.request.urlretrieve(jpgUrl, target)
    finally:
        print('图片---%s----下载操作完成' % sTitle)


def writeTxt(imageTitle, txtPath=TXT_PATH):
    """Append *imageTitle*, followed by a space, to the UTF-8 file *txtPath*.

    The file is created if it does not exist. ``with`` closes it even when
    the write fails, and an error from ``open`` is raised as-is.
    """
    with open(txtPath, 'a', encoding='utf-8') as f:
        f.write(imageTitle + ' ')


def main():
    """Fetch the page, download every cover and record its title."""
    # The context manager closes the HTTP response once the body is read.
    with urllib.request.urlopen(URL) as res:
        html = res.read().decode('utf-8')
    # zip() pairs each URL with its own title and cannot index out of range.
    pairs = zip(getJpg(html), getTitle(html))
    for n, (urlJpg, quotedTitle) in enumerate(pairs):
        title = quotedTitle[1:-1]  # drop the surrounding double quotes
        print(n, '--- url -->', urlJpg)
        downLoad(urlJpg, title, n)
        writeTxt(title)


if __name__ == '__main__':
    main()