# -*- coding:utf-8 -*-
import urllib2
import lxml.html
from lxml import etree
def main():
    """Scrape the country index page and hand the links to get_capital().

    Truncates ``./countrys.txt``, downloads the index page, and builds a
    list of single-entry dicts ``{link text: absolute detail-page URL}``,
    one per ``<li>`` found under ``div#main_content > ul``.

    Returns:
        Whatever ``get_capital`` returns for the collected list.
    """
    # Opening with 'w+' truncates (or creates) the output file, so the
    # appends performed later start from an empty file.
    with open('./countrys.txt', 'w+'):
        pass

    base_url = 'https://guojiadiqu.51240.com'
    response = urllib2.urlopen(base_url + '/')
    try:
        html = response.read()
    finally:
        # Release the connection even if the read fails.
        response.close()

    selector = etree.HTML(html)
    countrys = []
    for li in selector.xpath('//div[@id="main_content"]/ul/li'):
        names = li.xpath('./a/text()')
        hrefs = li.xpath('./a/@href')
        # Skip list items that carry no link instead of raising IndexError.
        if not names or not hrefs:
            continue
        countrys.append({names[0]: base_url + hrefs[0].strip()})
    return get_capital(countrys)
def _extract_capital(tree):
    """Return the first text node of the third nested-table row, or ''.

    An empty string is returned when the expected table, row, or cell text
    is absent, so one malformed page does not abort the whole run.
    """
    tables = tree.xpath('//div[@id="main_content"]/table')
    if not tables:
        return ''
    rows = tables[0].xpath('./tr/td/table/tr')
    if len(rows) < 3:
        return ''
    texts = rows[2].xpath('./td/text()')
    return texts[0] if texts else ''


def get_capital(list):
    """Fetch every country page and append one record to ./countrys.txt.

    Args:
        list: iterable of single-entry dicts mapping a country name to the
            URL of its detail page. (The parameter name shadows the builtin
            and is kept only so existing callers keep working.)

    Each record is written as ``"<1-based index> <name>\\n<text>\\n"``,
    UTF-8 encoded, where ``<text>`` is the first text node of the third row
    of the nested table on the page (presumably the capital -- confirm
    against the site), or empty when that cell cannot be found.

    Returns:
        None.
    """
    countries = list  # local alias; avoids using the shadowed builtin name
    for index, country in enumerate(countries, 1):
        name, url = next(iter(country.items()))

        response = urllib2.urlopen(url)
        try:
            html = response.read()
        finally:
            # Release the connection even if the read fails.
            response.close()

        capital = _extract_capital(etree.HTML(html))
        content = str(index) + ' ' + name + '\n' + capital + '\n'

        # Append per record so progress already made survives a later failure;
        # the context manager closes the handle even if the write raises.
        with open('./countrys.txt', 'a') as out:
            out.write(content.encode('utf-8'))
if __name__ == "__main__":
    # Run the scraper only when executed as a script, not when imported.
    main()