爬取新浪财经个股的历史财报摘要
网页的内容为:
想要的内容为:
- pd.DataFrame数据结构
- 方便查看数据和绘图
print df.tail()
print df.columns
print df.index.name
fig,(ax1,ax2)=plt.subplots(2,1)
# fig.set_figheight(fig.get_figheight()*2)
df.ix[:,(0,3)].plot(ax=ax1)
df.ix[:,5:7].plot(ax=ax2)
ax1.set_ylabel(u'(元)')
ax2.set_ylabel(u'(百万元)')
每股净资产 每股收益 每股现金含量 每股资本公积金 固定资产合计 流动资产合计
南宁糖业(000911)项目:财务摘要
2015-09-30 4.9220 0.1225 -1.8303 4.4944 NaN 3147.74
2015-12-31 4.9146 0.1847 0.5290 4.4944 NaN 3032.40
2016-03-31 4.5619 -0.3527 -3.1519 4.4944 NaN 3868.23
2016-06-30 4.2956 -0.6190 -3.9426 4.4944 NaN 3522.59
2016-09-30 4.1173 -0.7973 -2.4654 4.4944 NaN 3194.63
资产总计 长期负债合计 主营业务收入 财务费用 净利润
南宁糖业(000911)项目:财务摘要
2015-09-30 4730.00 695.012 2208.240 104.7310 39.7108
2015-12-31 5669.74 732.949 3138.420 143.6370 59.8534
2016-03-31 6565.34 1105.460 515.594 39.3776 -114.2920
2016-06-30 6086.67 1033.750 920.286 90.8180 -200.6000
2016-09-30 6021.67 1055.570 1624.810 135.9120 -258.3930
Index([u'每股净资产', u'每股收益', u'每股现金含量', u'每股资本公积金', u'固定资产合计', u'流动资产合计', u'资产总计',
u'长期负债合计', u'主营业务收入', u'财务费用', u'净利润'],
dtype='object')
南宁糖业(000911)项目:财务摘要
Out[403]: <matplotlib.text.Text at 0xdde1670>
matplotlib绘图:
代码:
def _parse_report_date(s):
    """Convert a 'YYYY-MM-DD' string such as '1998-12-31' into a datetime.

    The year, month and day are read by fixed character position
    (0-4, 5-7, 8-10), so the separators themselves are never inspected.
    Returns e.g. ``datetime(1998, 12, 31, 0, 0)``.
    """
    return datetime(int(s[0:4]), int(s[5:7]), int(s[8:10]))


def _cell_to_float(cell, scale=1.0):
    """Convert the raw text of one table cell into a float.

    Parameters
    ----------
    cell : str or None
        Text taken from a ``<td>`` (or the ``<a>`` inside it).
    scale : float
        Divisor applied to the parsed number (1.0 keeps the value as is).

    Returns
    -------
    float
        ``float(text) / scale``, or ``np.nan`` when the cell is missing
        (None / NaN) or contains only whitespace.

    Raises
    ------
    ValueError
        If the remaining text is not a valid number.
    """
    # A cell without text comes back as None from ``.text``; a key missing
    # from one quarter becomes NaN in the DataFrame.  Both mean "no value".
    if pd.isnull(cell):
        return np.nan
    text = cell.strip()
    # Compare by value: ``is`` / ``is not`` on strings tests object identity
    # and only appears to work by interpreter accident.
    if text == u'':
        return np.nan
    # Drop the currency suffix and any thousands separators before parsing.
    text = text.replace(u'元', u'').replace(u',', u'')
    return float(text) / scale


def get_gg_fin_abs(code='000911'):
    """Scrape a stock's financial-summary table from Sina Finance.

    The page ``.../vFD_FinanceSummary/stockid/<code>.html`` holds one table
    (id ``FundHoldSharesTable``) in which every reporting period is a run of
    rows: a bold "date" row followed by label/value rows.  Each period is
    collected into one DataFrame row.

    Parameters
    ----------
    code : str
        Stock code substituted into the page URL, default '000911'.

    Returns
    -------
    pandas.DataFrame
        One row per reporting period, sorted by ascending index.  The index
        holds ``datetime`` objects parsed from the date rows and is named
        after the stripped text of the table's first ``<th>``.  The first
        four columns are parsed as plain floats; all remaining columns are
        parsed and divided by 1,000,000 (the usage example accompanying this
        function plots them on an axis labelled in millions of yuan).
        Blank cells become NaN.

    Notes
    -----
    - XPath: ``'td[1]'`` selects the first ``<td>``; the ``[n]`` predicate is
      1-based.
    - Reading ``element.text`` is much terser than
      ``element.xpath('text()')[0]``, provided the element actually exists.
    - ``.find(path)`` returns a single element (or None) and
      ``.findall(path)`` returns a list of elements.
    - ``<tbody>`` inside ``<table>`` is usually inserted by the browser and
      is not present in the HTML source, so it cannot be matched with
      ``.find()`` / ``.xpath()``; address rows through the parent
      ``<table>`` instead.  (A path copied from Chrome's inspector such as
      ``//*[@id="FundHoldSharesTable"]/tbody/tr[1]`` therefore needs the
      ``tbody`` step removed.)
    - Date rows wrap their text in ``<strong>``, e.g.
      ``<td align="left" class="tdr"><strong>2015-09-30</strong></td>``
      (inspect with ``etree.tostring(...)``), so the date must be read from
      ``td[2]/strong`` rather than from ``td[2]`` itself.

    References
    ----------
    - Get all td content inside tr of tbody in python using lxml
      http://stackoverflow.com/questions/37080910/get-all-td-content-inside-tbody-of-tr-in-python-using-lxml
    """
    from collections import OrderedDict

    # The leading columns are per-share figures kept in yuan; every later
    # column is rescaled to millions.
    per_share_columns = 4
    millions = 1000000.0

    url = 'http://vip.stock.finance.sina.com.cn/corp/go.php/vFD_FinanceSummary/stockid/%s.html'
    url = url % (code)

    # NOTE(review): ``idom()`` presumably downloads and parses the page so
    # that ``craw.dom`` is an lxml tree -- confirm against the crawler module.
    craw = crawler.Crawler(url)
    craw.idom()
    trs = craw.dom.xpath('//table[@id="FundHoldSharesTable"]//tr')
    # Row-count diagnostic kept from the original; the parenthesised form
    # prints the same text under Python 2 and Python 3.
    print(len(trs))

    # ------ header: text of the first <th> in the first row ------
    header = trs[0].find('th').text.strip()

    # ------ body: group label/value rows under their date row ------
    quarters = OrderedDict()   # report date string -> OrderedDict of items
    report_date = None
    items = None               # items of the period currently being read
    for tr in trs[1:]:         # data starts at the second <tr>
        if tr.xpath('td[@height="5px"]'):
            continue           # spacer row between periods
        first_td = tr.find('td[1]')
        if first_td is None:
            continue           # row without any <td>: nothing to read
        if first_td.find('strong') is not None:
            # Date row: the date sits in the second cell, inside <strong>.
            report_date = tr.find('td[2]/strong').text
            items = OrderedDict()
            continue
        if items is None:
            continue           # data row before any date row: no owner yet

        label = first_td.text
        # Test the <a> element itself, never ``.text`` of a lookup that may
        # have returned None.
        link = tr.find('td[2]/a')
        if link is not None:
            value = link.text
        else:
            value = tr.find('td[2]').text
        items[label] = value

        # The net-profit row is the last row of a period: store the period.
        if label == u'净利润':
            quarters[report_date] = items

    # OrderedDicts keep the page's column order and the periods' row order.
    df = pd.DataFrame(list(quarters.values()), index=list(quarters.keys()))
    df.index = [_parse_report_date(d) for d in df.index]
    df.index.name = header

    # Replace each text column with its numeric form.  Assigning by column
    # label swaps the whole column, so the result gets a float dtype.
    for position, column in enumerate(df.columns):
        scale = 1.0 if position < per_share_columns else millions
        df[column] = [_cell_to_float(cell, scale) for cell in df[column]]

    return df.sort_index()
#==============================================================================
# print df.tail()
# print df.columns
# print df.index.name
#
# fig,(ax1,ax2)=plt.subplots(2,1)
# # fig.set_figheight(fig.get_figheight()*2)
# df.ix[:,(0,3)].plot(ax=ax1)
# df.ix[:,5:7].plot(ax=ax2)
# ax1.set_ylabel(u'(元)')
# ax2.set_ylabel(u'(百万元)')
#==============================================================================