zoukankan      html  css  js  c++  java
  • 爬取英文名详细内容

    csv存储使用html_save(s)函数

    图片存储使用pic_save(url,name)函数

    爬取时,首先爬取首页上所有英文名详情页的链接并存入列表;然后依次爬取列表中的每个链接,并调用存储函数保存有价值的数据。

     1 import sys
     2 import io
     3 import re
     4 sys.stdout=io.TextIOWrapper(sys.stdout.buffer,encoding='gb18030')
     5 import requests
     6 from bs4 import BeautifulSoup
     7 from urllib import request
     8 
     9 def html_save(s):
    10     with open('Name.csv','a',encoding='gb18030')as f:
    11         f.write(s+'
    ')
    12 def pic_save(url,name):
    13     root="C://Users//L//Desktop//ba//"
    14     # name=url.split('=')[-1]
    15     path=root+name+'.jpg'
    16     r=requests.get(url)
    17     with open(path,'wb')as f:
    18         f.write(r.content)
    19         f.close()
    20         # print('ok')
    21 # soup = BeautifulSoup(html,'index')
    22 def getName_link():
    23     lst=[]
    24     url='http://www.babynology.com/baby-boy-names.html'
    25     r=requests.get(url)
    26     soup= BeautifulSoup(r.text,'html.parser')
    27     # soup = BeautifulSoup(open('Girl.html'))
    28     for div in soup.find_all('div',{'class':'babynology_textevidence babynology_bg_grey babynology_shadow babynology_radius left overflow_scroll'}):
    29         for strong in div.find_all('strong'):
    30             # print(strong.find_all('a')[0].text.replace('    ','').replace(' ','').replace('
    ',''))
    31             # print(strong.find_all('a')[0].get('href').replace('
    ',''))
    32             i=strong.find_all('a')[0].text.replace('    ','').replace(' ','').replace('
    ','')
    33             j=strong.find_all('a')[0].get('href').replace('
    ','')
    34             lst.append(j)
    35             # html_save(i)
    36             # html_save(j)
    37     # # print(lst)        
    38     return lst
    39 
    40 def hh(lst):
    41     for i in lst:
    42         url=i
    43     # url='http://www.babynology.com/name/bahula-m.html'
    44         r=requests.get(url)
    45         soup= BeautifulSoup(r.text,'html.parser')
    46         name=soup.find('h2',{'class':'txtclrm name-head2'}).text
    47         print("Name:",name)
    48         # print(soup)
    49         #gender=soup.find('div',{'class':'grid grid_8'})#.find('div',{'class':'babynology_textevidence babynology_width_percentage40 babynology_width100_responsive'})
    50         gender=soup.find('h5',{'style':'color:#000;'}).text
    51         print("Gender:",gender)
    52         # Numerology=soup.find('h5',{'style':'color:#000; text-align:justify;'}).stripped_strings
    53         # font=soup.find('h5',{'style':'color:#000; text-align:justify;'}).find('font').text
    54         # print(type(Numerology))
    55         # Numerology=str(Numerology)
    56         Numerology=soup.find('h5',{'style':'color:#000; text-align:justify;'}).text.replace('   ','').replace('
    ','').replace('    ','')
    57         a=soup.find('h5',{'style':'color:#000; text-align:justify;'}).find('span').text.replace('   ','').replace('
    ','').replace('    ','')
    58         b=soup.find('h5',{'style':'color:#000; text-align:justify;'}).find('script').text.replace('   ','').replace('
    ','').replace('    ','')
    59         n=Numerology.strip(b).strip(a)
    60         # print(name,'Numerology:',font,"%s"%list(Numerology)[1].replace('
    ','').replace('    ','').replace('      ',''))
    61         print(name,'Numerology:',Numerology.strip(b).strip(a))
    62         n=Numerology.strip(b).strip(a)
    63         n=name+' Numerology:'+n
    64         n=n.replace('',' ').replace(',',' ')
    65         # n=n.encode('UTF-8','ignore').decode('UTF-8')
    66         print(n)
    67         # url='http://www.babynology.com/name/bahula-m.html'
    68         r=requests.get(url)
    69         pic=soup.find('img',{'style':'margin-left:-10px; margin-top:-5px;'}).get('src')
    70         # print(pic)
    71         html_save('Name:'+name)
    72         html_save('Gender:'+gender)
    73         html_save(n)
    74         pic_save(pic,name)
    75         # html_save('--------------------------------------------------------------------------------------------------------------------------')
    76         print('---------------------------------------------------------------------------')
    77         # print(name,'Numerology:',Numerology.strip(b).strip(a))
    78 hh(getName_link())
  • 相关阅读:
    python的变量,对象的内存地址以及参数传递过程
    win10环境pycharm社区版创建django项目
    组合,菱形继承,子类重用父类2,深度广度查找
    类内的函数共享给对象使用
    模块与面向对象初解
    正则模块,sys模块
    包介绍,与日记模块
    模块运用,文件搜索
    递归,匿名函数
    生成器与简写
  • 原文地址:https://www.cnblogs.com/huanghuangwei/p/12077452.html
Copyright © 2011-2022 走看看