[原创]在线视频下载(Using Python / Bash / C / Regular Expressions)
Windows上下载在线视频不是很难, 可以安装爱酷等对应在线视频(这里是优酷)的官方下载工具, 更通用地, 可以使用硕鼠下载, 这个软件我没用过, 但我需要使用硕鼠官方网站http://www.flvcd.com(支持70多个在线视频网站的解析, 好强大的说)的视频解析作为代理将某个在线视频播放地址解析成对应的下载地址, 我使用Python和正则表达式进行抓取我想要的部分(下载地址以及视频标题)并且下载给定视频地址的视频, 这个脚本如下:
#!/usr/bin/env python
import sys
def output(s):
    """Write the progress message *s* followed by a newline to stderr.

    stderr is used for progress only (these are not real error messages),
    the same way curl does it, so that stdout carries nothing but the
    downloaded file names and can be captured to merge the flv blocks later.
    """
    sys.stderr.write(s + "\n")
import re
import urllib
import urllib2
import zlib

# --- command line: videourl [videoquality], quality defaults to 'super' ---
argc = len(sys.argv)
if argc == 2:
    video_format = 'super'
elif argc == 3:
    video_format = sys.argv[2]
else:
    output("Usage: %s videourl [videoquality=normal|high|super|...]" % sys.argv[0])
    output(" e.g.")
    output(" %s http://v.youku.com/v_show/id_XMzMzMjE0MjE2.html super" % sys.argv[0])
    # sys.exit instead of the bare exit(): the latter is injected by the
    # `site` module and is missing when Python runs with -S.
    sys.exit(1)
videourl = sys.argv[1]

# --- ask flvcd.com to translate the play address into download addresses ---
url = ('http://www.flvcd.com/parse.php?kw=' + urllib.quote(videourl)
       + '&format=' + video_format)
req = urllib2.Request(url)
# add some headers to fake Firefox Browser (if we don't do so, there will be
# a problem when trying to get tudou video)
req.add_header('host', 'www.flvcd.com')
# NOTE(review): the Referer is the request url minus its last 4 characters;
# kept exactly as the original sends it -- confirm this is intentional.
req.add_header('Referer', url[:-4])
req.add_header('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0) Gecko/20100101 Firefox/4.0')
req.add_header('Accept-Language', 'en-us,en;q=0.5')
req.add_header('Accept-Encoding', 'gzip, deflate')
req.add_header('Accept-Charset', 'ISO-8859-1,utf-8;q=0.7,*;q=0.7')
req.add_header('Keep-Alive', '115')
res = urllib2.urlopen(req)
html = res.read()
# We advertise gzip/deflate above and urllib2 does not decode it for us, so
# undo the content encoding ourselves before running the regex on the page.
content_encoding = (res.info().getheader('Content-Encoding', '') or '').lower()
if 'gzip' in content_encoding:
    html = zlib.decompress(html, 16 + zlib.MAX_WBITS)
elif 'deflate' in content_encoding:
    try:
        html = zlib.decompress(html)
    except zlib.error:
        # some servers send a raw deflate stream without the zlib header
        html = zlib.decompress(html, -zlib.MAX_WBITS)

# --- pull the hidden "inf" field, which carries the titles and addresses ---
pattern = re.compile(r'<input\s+type="hidden"\s+name="inf"\s+value="([^"]+)')
firstmatch = pattern.search(html)
if firstmatch is None:
    output('Can not find any download address for "%s" (format "%s")' % (videourl, video_format))
    sys.exit(1)
# the captured bytes are decoded as GBK into a unicode string
urls = unicode(firstmatch.group(1), 'gbk')
urlpattern = re.compile(r'<[NU]>(.+)')
result = urlpattern.findall(urls)
# consecutive matches are treated as [title, address] pairs; a trailing
# half pair (title without address) is dropped instead of raising IndexError
data = [result[i:i + 2] for i in range(0, len(result), 2)]
data = [pair for pair in data if len(pair) == 2]
count = len(data)
files = []

# --- download every block and remember its (shell-escaped) file name ---
output('\n--- Start to download from url "%s" (%d block(s) in total):' % (videourl, count))
for k, (title, blockurl) in enumerate(data):
    # titles come from a remote page: never let one address another directory
    filename = title.replace('/', '_') + '.flv'
    output(' >downloading Block %.2d of %.2d ...' % (k + 1, count))
    urllib.urlretrieve(blockurl, filename)
    # The names are printed inside double quotes and re-parsed by the shell
    # (eval), so every character that is special inside double quotes must be
    # escaped: backslash first (so later escapes are not doubled), then ",
    # $ and the backtick.
    escaped = filename
    for special in ('\\', '"', '$', '`'):
        escaped = escaped.replace(special, '\\' + special)
    files.append(escaped.encode('utf-8'))
    output(' downloaded Block.%.2d completely<' % (k + 1,))
output('--- finished ---\n')
# stdout: the quoted, space separated file names, ready for `eval set`
print('"' + '" "'.join(files) + '"')
我们保存这个脚本到home(即~)目录下, 为dl.py, 即保存为~/dl.py, 操作如下
# 打开终端
vi ~/dl.py # press ENTER
# press i
# 粘贴上面的Python代码
# press ESC
:wq # press ENTER
sudo chmod u+x ~/dl.py # 使其可执行
这个脚本首先解析命令行参数然后提取在线视频地址和欲下载视频的质量,然后通过flvcd.com代理解析成若干个下载地址(优酷网站会将大视频分割成若干块,但像土豆等视频网站不会),接着使用正则提取视频标题和下载地址并下载,输出进度到stderr, 输出每块下载的视频文件文件名(这些文件名中特殊字符被转义接着被"包围并以空格分开)到stdout,程序添加的一些HTTP头是为了欺骗flvcd网站我们是通过Firefox浏览器访问的(但并没有完全欺骗, 似乎部分土豆在线视频的下载地址无法下载, to be solved)
输出空格分割的文件名是为了方便shell对stdout捕获然后使用下面的程序进行flv文件合并(主要针对youku, 前面说过, 优酷的大视频是分段的, 所以我写了下面的C程序合并这些分段的flv视频):
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// #include <malloc.h> // uncomment this line if you are using windows
// unsigned 8-bit unit; every raw FLV field in this file is declared as one or more of these
typedef unsigned char byte;
// Leading record of an FLV file, read and written as one raw block
// (fread/fwrite with sizeof(FlvHeader)).
// NOTE(review): this relies on the compiler adding no padding; all members are
// byte-sized so that is presumably true -- confirm sizeof(FlvHeader) == 9.
typedef struct {
// signature; flv_is_valid_header expects 'F', 'L', 'V'
byte type[3];
// NOTE(review): presumably the FLV version number; nothing in this file reads it
byte version;
// flag bits; flv_is_valid_header rejects any bit outside the mask 0x05
byte typeFlag;
// raw 4-byte field; main decodes it with intval(..., 4) and seeks to that
// offset + 4 to reach the first tag
byte headerLength[4];
} FlvHeader;
// Fixed-size meta part that precedes every tag's data, read and written as one
// raw block (fread/fwrite with sizeof(FlvTag)).
// NOTE(review): relies on the compiler adding no padding -- confirm
// sizeof(FlvTag) == 11.
typedef struct {
// kind of tag: 0x12 is handled as script data, 0x8 and 0x9 as the two media
// streams (audio/video) whose timestamps main rewrites
byte tagType;
// raw 3-byte length of the tag data that follows, decoded with intval(..., 3)
byte dataSize[3];
// raw 3-byte low part of the timestamp, decoded with intval(..., 3)
byte timestamp[3];
// bits 24..31 of the timestamp (see flv_tag_get_timestamp)
byte timestamp_extension;
// NOTE(review): presumably the stream id; nothing in this file reads it
byte streamID[3];
// byte tagData[ dataSize ]
} FlvTag;
// Report the byte order of the machine this program runs on.
// With a 4-byte int, the value 1 is laid out as "00 00 00 01" on a big endian
// host (network/media order) and as "01 00 00 00" on a little endian host, so
// looking at the byte stored at the lowest address is enough.
// Returns non-zero on a big endian host, 0 on a little endian one.
int is_big_endian() {
    int probe = 1;
    char lowest_address_byte = *(char*)&probe;
    return lowest_address_byte != '\x1';
}
// Print msg to stderr, then terminate the whole program with exit code
// exitStatus. This function never returns to its caller.
void quit(char* msg, int exitStatus) {
    fputs(msg, stderr);
    exit(exitStatus);
}
// Decode a raw integer read from an FLV file into an int of the current host.
//   bits: pointer to the first (most significant) raw byte, must not be NULL
//   size: number of raw bytes to decode, 1..4
// Returns the decoded value; calls quit() when the arguments are invalid.
//
// FLV stores integers most significant byte first. Building the value with
// shifts is arithmetic, not a memory reinterpretation, so the very same loop is
// correct on big and little endian hosts. The former big endian shortcut
// (*(int*)bits) always read 4 bytes, which decoded 3-byte fields wrongly and
// read past the end of the field.
int intval(byte* bits, int size) {
    unsigned int value = 0;  // unsigned: shifting into the top bit stays well defined
    int i;
    if (bits == NULL || size < 1 || size > 4) {
        quit("invalid bits(is NULL?) or size(out of [1,4]?) when calling intval\n", 1);
    }
    for (i = 0; i < size; i++)
        value = (value << 8) | bits[i];
    return (int)value;
}
// Encode a host int as the raw bytes used inside an FLV file (most significant
// byte first).
//   value: the integer to encode
//   size:  how many of its low-order bytes the caller wants, 1..4
// Returns a pointer to the last `size` bytes of an internal static 4-byte
// buffer, so the result is overwritten by the next call. Calls quit() when
// size is out of range.
// NOTE(review): the byte shuffling assumes sizeof(int) == 4.
byte* byteval(int value, int size) {
    static byte bits[4] = {0};
    byte* raw = (byte*)&value;
    int k;
    if (!(size >= 1 && size <= 4)) {
        quit("invalid size(out of [1,4]?) when calling byteval\n", 1);
    }
    if (!is_big_endian()) {
        // little endian host: mirror the four bytes of value into bits
        for (k = 3; k >= 0; --k)
            bits[3 - k] = raw[k];
    } else {
        // big endian host: the in-memory layout already is the file layout
        *(int*)bits = value;
    }
    return &bits[4 - size];
}
// Decode the 8 raw bytes at bits (a double as stored in an FLV file) into a
// double of the current host: bytes are copied as-is on a big endian host and
// mirrored on a little endian one.
//   bits: pointer to the first of the 8 raw bytes, must not be NULL
// Returns the decoded value; calls quit() when bits is NULL.
//
// The bytes are moved with memcpy instead of dereferencing a double*: bits
// points at an arbitrary offset inside tag data and a byte array carries no
// alignment guarantee, so the cast could fault on strict-alignment hosts.
double doubleval(byte* bits) {
    byte ordered[8];
    double value;
    int i;
    if (bits == NULL)
        quit("invalid bits(is NULL?)\n", 1);
    if (is_big_endian()) {
        memcpy(ordered, bits, 8);
    } else {
        for (i = 0; i < 8; ++i)
            ordered[i] = bits[7 - i];
    }
    memcpy(&value, ordered, 8);
    return value;
}
// Encode a host double as the 8 raw bytes used inside an FLV file: copied
// as-is on a big endian host, mirrored on a little endian one.
// Returns a pointer to an internal static 8-byte buffer, so the result is
// overwritten by the next call.
byte* bytevaldouble(double value) {
    static byte bits[8] = {0};
    byte* raw = (byte*)&value;
    int k;
    if (!is_big_endian()) {
        // little endian host: mirror the eight bytes of value into bits
        for (k = 7; k >= 0; --k)
            bits[7 - k] = raw[k];
        return bits;
    }
    *(double*)bits = value;
    return bits;
}
// Read one raw FlvHeader record from fp into *header.
// Returns header when a complete record was read, NULL otherwise.
FlvHeader* flv_header_read(FILE* fp, FlvHeader* header) {
    size_t records = fread(header, sizeof(FlvHeader), 1, fp);
    if (records != 1)
        return NULL;
    return header;
}
// Decide whether *header looks like an FLV header, i.e. whether the file it
// came from can take part in the merge.
// Returns 1 when header is non-NULL, its signature is "FLV" and typeFlag has no
// bit set outside the mask 0x05; returns 0 otherwise.
int flv_is_valid_header(FlvHeader* header) {
    if (!header)
        return 0;
    if (header->type[0] != 'F' || header->type[1] != 'L' || header->type[2] != 'V')
        return 0;
    return (header->typeFlag | 5) == 5;
}
// Read the next FLV tag from fp.
//   tag:          receives the raw tag meta (FlvTag record)
//   dataSize:     receives the length of the tag data in bytes
//   previousSize: receives the previous-tag-size field that follows the tag
//                 data, decoded into a host int
// Returns a pointer to the tag data, or NULL when no further tag meta can be
// read (normal end of file). Calls quit() when the tag data or the trailing
// size field is truncated, or when memory runs out.
//
// CAUTION: the returned buffer is a static one owned by this function. It is
// reused (and possibly reallocated) by the next call, so the pointer is only
// valid until then, and the caller must not free it if it intends to call
// flv_tag_read again. The last allocation is deliberately kept until exit.
byte* flv_tag_read(FILE* fp, FlvTag* tag, int* dataSize, int* previousSize) {
    static byte* _tagData = NULL;
    static int _capacity = 0;  // allocated length of _tagData (not the last tag's length)
    byte rawPrevious[4];
    int tagSize;
    if (fread(tag, sizeof(FlvTag), 1, fp) != 1)
        return NULL;
    tagSize = intval(tag->dataSize, 3);
    if (_tagData == NULL || _capacity < tagSize) {
        // grow only when the buffer is really too small
        free(_tagData);  // free(NULL) is a no-op
        // never ask for 0 bytes: malloc(0) may return NULL, and a NULL return
        // value of this function means "no more tags" to the caller
        _capacity = tagSize > 0 ? tagSize : 1;
        _tagData = (byte*)malloc(_capacity * sizeof(byte));
        if (_tagData == NULL)
            quit("out of memory when reading FLV tag data.\n", 1);
    }
    if (fread(_tagData, sizeof(byte), tagSize, fp) != (size_t)tagSize ||
        fread(rawPrevious, sizeof(rawPrevious), 1, fp) != 1) {
        quit("FLV tag data(broken tag data or broken previous size?) is broken.\n", 1);
    }
    *dataSize = tagSize;
    *previousSize = intval(rawPrevious, 4);
    return _tagData;
}
// Find the first occurrence of the byte sequence `search` (searchLength bytes)
// inside `data` (dataSize bytes) with a plain linear scan.
// Returns the index of the match in data, or -1 when there is none -- this
// includes the case where data is shorter than search. Calls quit() when a
// pointer is NULL or searchLength < 1.
int stupid_byte_indexof(byte* search, int searchLength, byte* data, int dataSize) {
    int i, last = dataSize - searchLength;  // last index at which a match still fits
    if (search == NULL || data == NULL || searchLength < 1)
        quit("invalid arguments when searching", 1);
    // `<=`: a match that ends exactly at the end of data starts at `last`,
    // the former `i < end` loop never tested that position
    for (i = 0; i <= last; ++i) {
        if (memcmp(data + i, search, searchLength) == 0)
            return i;
    }
    return -1;
}
// Remove the keyframes information from script tag data, in place.
//   tag:           meta of the script tag (tagType must be 0x12); its dataSize
//                  field is rewritten when the data gets shorter
//   scripttagData: the tag data to modify
//   dataSize:      in: current data length, out: length after stripping
// Two independent steps:
//   1. when "hasKeyframes" followed by the byte 0x01 is found, the byte right
//      after that match is set to 0 (i.e. hasKeyframes becomes false);
//   2. when the sequence 00 09 "keyframes" 03 is found, the data is cut off at
//      the start of that sequence.
// Returns scripttagData. Calls quit() on a NULL argument or a non-script tag.
byte* flv_scriptdata_strip_keyframes(FlvTag* tag, byte* scripttagData, int* dataSize) {
    byte flagKey[] = {'h', 'a', 's', 'K', 'e', 'y', 'f', 'r', 'a', 'm', 'e', 's', '\x1'};
    byte listKey[] = {'\x0', '\x9', 'k', 'e', 'y', 'f', 'r', 'a', 'm', 'e', 's', '\x3'};
    int flagKeyLength = sizeof(flagKey)/sizeof(byte);
    int listKeyLength = sizeof(listKey)/sizeof(byte);
    int position;
    if (! (tag && tag->tagType == 0x12 && scripttagData && dataSize)) {
        quit("can't strip non-scriptdata's[null or video/audio tag data?] keyframes or null pointer", 1);
    }
    // search in one byte less than the full data so that the byte following a
    // match is still inside the data
    position = stupid_byte_indexof(flagKey, flagKeyLength, scripttagData, *dataSize - 1);
    if (position != -1)
        scripttagData[position + flagKeyLength] = '\x0';
    position = stupid_byte_indexof(listKey, listKeyLength, scripttagData, *dataSize);
    if (position != -1) {
        // everything from the keyframes entry on is dropped; keep the length
        // stored in the tag meta in step with the new data length
        *dataSize = position;
        memcpy(tag->dataSize, byteval(position, 3), 3);
    }
    return scripttagData;
}
// Write one FLV tag to fp: the tag meta, *dataSize bytes of tag data and the
// 4-byte size field that trails every tag.
//   tag:          raw tag meta to write
//   tagData:      tag data to write (may be NULL only when *dataSize is 0)
//   dataSize:     length of tagData in bytes
//   previousSize: kept for interface compatibility; its value is not written
// Returns the number of bytes written, or 0 on invalid arguments or when any
// of the writes fails.
//
// The trailing size field is computed here as sizeof(FlvTag) + *dataSize and
// emitted most significant byte first. Writing the caller's int with fwrite
// stored it in host byte order (reversed on little endian hosts), and the value
// read from the source file was stale once the keyframes had been stripped
// from a script tag.
int flv_tag_write(FILE* fp, FlvTag* tag, byte* tagData, int* dataSize, int* previousSize) {
    byte trailer[4];
    unsigned int tagSize;
    (void)previousSize;
    if (fp == NULL || tag == NULL || dataSize == NULL || *dataSize < 0 ||
        (tagData == NULL && *dataSize > 0)) {
        return 0;
    }
    tagSize = (unsigned int)sizeof(FlvTag) + (unsigned int)*dataSize;
    trailer[0] = (byte)((tagSize >> 24) & 0xFF);
    trailer[1] = (byte)((tagSize >> 16) & 0xFF);
    trailer[2] = (byte)((tagSize >> 8) & 0xFF);
    trailer[3] = (byte)(tagSize & 0xFF);
    if (
        fwrite(tag, sizeof(FlvTag), 1, fp) != 1 ||
        fwrite(tagData, sizeof(byte), *dataSize, fp) != (size_t)*dataSize ||
        fwrite(trailer, sizeof(trailer), 1, fp) != 1
    ) {
        return 0;
    }
    return (int)(sizeof(FlvTag) + *dataSize * sizeof(byte) + sizeof(trailer));
}
// Extract the duration from the data of an FLV SCRIPT tag.
//   tagData:  pure tag data; the caller must make sure the tag is a script
//             tag (tagType == 0x12)
//   dataSize: length of tagData in bytes
//   offset:   when not NULL, receives the index inside tagData at which the
//             8 raw bytes of the duration start
// Returns the duration decoded with doubleval(). Calls quit() when the key is
// not present or when fewer than 8 bytes follow it.
double flv_tag_get_duration(byte* tagData, int dataSize, int* offset) {
    // the key is searched together with the zero byte that follows it
    byte search[9] = { 'd', 'u', 'r', 'a', 't', 'i', 'o', 'n', '\0' };
    int keyLength = sizeof(search)/sizeof(byte);
    int index = stupid_byte_indexof(search, keyLength, tagData, dataSize);
    if (index == -1) {
        quit("Sorry, can't get flv meta duration.", 1);
    }
    index += keyLength;
    // doubleval reads 8 bytes: refuse to read past the end of the tag data
    if (index + 8 > dataSize) {
        quit("Sorry, flv meta duration is truncated.", 1);
    }
    if (offset)
        *offset = index;
    return doubleval(tagData + index);
}
// Read the timestamp stored in an FLV tag meta: the extension byte supplies
// bits 24..31, the 3-byte timestamp field the low 24 bits.
// Returns the combined timestamp, or -1 when tag is NULL.
int flv_tag_get_timestamp(FlvTag* tag) {
    int high, low;
    if (tag == NULL)
        return -1;
    high = (int)(tag->timestamp_extension) << 24;
    low = intval(tag->timestamp, 3);
    return high + low;
}
// Store timestamp into an FLV tag meta: the low 24 bits go to the 3-byte
// timestamp field, bits 24 and up to the extension byte.
// Returns timestamp, or -1 (leaving the tag untouched) when tag is NULL or
// timestamp is negative.
int flv_tag_set_timestamp(FlvTag* tag, int timestamp) {
    byte* low24;
    if (tag == NULL || timestamp < 0)
        return -1;
    low24 = byteval(timestamp & 0x00FFFFFF, 3);
    tag->timestamp[0] = low24[0];
    tag->timestamp[1] = low24[1];
    tag->timestamp[2] = low24[2];
    tag->timestamp_extension = timestamp >> 24;
    return timestamp;
}
// Merge FLV files: argv[1] is the file to create, argv[2..] are the source
// blocks in playing order.
// The header and the first script tag (with its keyframes stripped) are taken
// from the first source; audio/video tags of every source are appended with
// their timestamps shifted behind the preceding sources. Finally the duration
// stored in the written script tag is patched with the sum of all durations.
// Returns 0 on success; exits with status 1 on any error.
int main(int argc, char* argv[]) {
    FlvHeader header;
    FlvTag tag;
    byte* tagData;
    FILE *fpdst = NULL, *fpsrc = NULL;
    int i = 0, srccount = argc - 2, headerLength, duration_index = 0,
        prevSize, dataSize, offset, foundduration = 0, wrotescript = 0, written, zero = 0,
        basetimestamp[2], lasttimestamp[2] = {0};
    char** src = argv + 2;
    double duration = 0.0;
    int bts = 0;  // base timestamp (ms) added to every tag of the current source

    // at least the destination and one source are needed; with the destination
    // alone the duration used to be written at offset 0 of an empty file
    if (argc < 3) {
        fprintf(stderr, "Usage: %s flvtobesaved 1stflv [2ndflv [3rdflv [...]]]\n", argv[0]);
        exit(1);
    }
    if ((fpdst = fopen(argv[1], "wb")) == NULL) {
        fprintf(stderr, "Can't write to file '%s'\n", argv[1]);
        exit(1);
    }
    while (i < srccount) {
        if ((fpsrc = fopen(src[i], "rb")) == NULL) {
            fprintf(stderr, "Can't open file '%s'\n", src[i]);
            exit(1);
        }
        if (! flv_header_read(fpsrc, &header) || ! flv_is_valid_header(&header)) {
            fprintf(stderr, "The header of file '%s' is broken or is not FLV header.\n", src[i]);
            exit(1);
        }
        if (i == 0) {
            // the first source donates the file header;
            // the first previous tag size is 0
            if (fwrite(&header, sizeof(FlvHeader), 1, fpdst) != 1 ||
                fwrite(&zero, sizeof(int), 1, fpdst) != 1) {
                quit("Can't write the FLV header to the destination file.\n", 1);
            }
            duration_index = sizeof(FlvHeader);
        }
        headerLength = intval(header.headerLength, 4);
        // skip to real flv tag data (skip the first previous tag size, +4)
        if (0 != fseek(fpsrc, headerLength+4, SEEK_SET)) {
            fprintf(stderr, "The first previousSize(should be 0) of file '%s' is broken.\n", src[i]);
            exit(1);
        }
        // start this source at the later of: the summed durations so far, or
        // the last timestamp written for either media stream
        bts = (int)(duration * 1000);
        basetimestamp[0] = lasttimestamp[0];
        basetimestamp[1] = lasttimestamp[1];
        if (bts < basetimestamp[0])
            bts = basetimestamp[0];
        if (bts < basetimestamp[1])
            bts = basetimestamp[1];
        foundduration = 0;
        while ((tagData = flv_tag_read(fpsrc, &tag, &dataSize, &prevSize)) != NULL) {
            written = 1;  // tags that are skipped count as success
            if (tag.tagType == 0x12 && ! foundduration) { // if script data and duration not found, try to get duration
                duration += flv_tag_get_duration(tagData, dataSize, &offset);
                foundduration = 1;
                if (i == 0) { // prepare the script data for writing, we choose the first FLV file header as sample
                    // position of the duration value inside the output file:
                    // previous size field (4) + tag meta + offset in the data
                    duration_index += 4 + sizeof(FlvTag) + offset;
                    flv_scriptdata_strip_keyframes(&tag, tagData, &dataSize);
                    written = flv_tag_write(fpdst, &tag, tagData, &dataSize, &prevSize);
                    wrotescript = 1;
                }
            } else if (tag.tagType == 0x8 || tag.tagType == 0x9) {
                lasttimestamp[tag.tagType - 0x8] = bts + flv_tag_get_timestamp(&tag);
                flv_tag_set_timestamp(&tag, lasttimestamp[tag.tagType - 0x8]);
                written = flv_tag_write(fpdst, &tag, tagData, &dataSize, &prevSize);
                if (i == 0 && ! foundduration) {
                    // media tags written ahead of the script tag push the
                    // duration further into the file
                    duration_index += 4 + sizeof(FlvTag) + dataSize;
                }
            }
            if (! written)
                quit("Can't write FLV tag to the destination file.\n", 1);
        }
        printf("completely merging file '%s' to '%s'\n", src[i], argv[1]);
        fclose(fpsrc);
        ++i;
    }
    if (wrotescript) {
        if (0 != fseek(fpdst, duration_index, SEEK_SET))
            quit("can't seek to duration\n", 1);
        // save real duration to file
        if (fwrite(bytevaldouble(duration), 1, 8, fpdst) != 8)
            quit("can't write duration\n", 1);
    } else {
        // without a script tag from the first source duration_index does not
        // point at a duration value; patching would overwrite tag data
        fprintf(stderr, "warning: no script data tag was written, duration is not saved\n");
    }
    if (fclose(fpdst) != 0)
        quit("can't finish writing the destination file\n", 1);
    return 0;
}
我们保存为~/flvmerge.c(方法同上), 然后编译并链接成可执行程序:
# 打开终端, 如已经打开可以忽略
vi ~/flvmerge.c # press ENTER
# press i
# 粘贴上面的C代码
# press ESC
:wq # press ENTER
gcc -o ~/flvmerge ~/flvmerge.c
我不期望读者能够看懂这个C程序, 因为牵涉到FLV文件结构和C语言指针的知识, 而我在写这个程序时也是遇到很多困难: 没有改变后续分段的timestamp, 各分段的起始timestamp不一致导致声音视频不同步, keyframes不完整(这个在高级的播放器播放没有问题,但对于很傻的flv播放器比如优酷flv播放器的话,它就无法手动快进到keyframes之后的timestamp, 所以我正考虑要不要修复keyframes, 但我用FLV MetaData Injector修复scriptdatatag损坏的flv视频发现, 它猥琐地删除了keyframes, 所以我也...然后快进快退都ok了), 但这个合并程序局限在合并的各个flv视频的scriptdatatag必须对duration(即视频时长)有定义而且不可能能够应对相当多的情况(其实也不是不可能, 但需要看flash文档), 我正在抽时间解决这个问题, 很幸运地, 优酷的各个分段的scriptdatatag都挺完整的。另外这个程序在小端机器上编译通过测试没问题, 程序里我提供了对大端机器的支持, 但可惜没有环境测试。
接下来是连接这两个程序的桥梁: -- bash 脚本上场了
#!/bin/bash
# Download an online video with ~/dl.py and, when it arrives in more than one
# block, merge the blocks into a single FLV file with ~/flvmerge.
# Usage: g videourl [videoquality]
if [ "$#" -ne 1 ] && [ "$#" -ne 2 ]
then
    echo "Usage: $0 videourl [videoquality=normal|high|super|...]"
    echo ' e.g.'
    echo " $0 http://v.youku.com/v_show/id_XMzMzMjE0MjE2.html super"
    exit 1
fi
# Forward exactly the arguments we got: passing "$2" when it is unset hands
# dl.py an empty quality and bypasses its own default.
# stdout of dl.py is the list of quoted file names, progress goes to stderr.
if ! files=$(~/dl.py "$@")
then
    echo "download failed, nothing to merge" >&2
    exit 1
fi
# NOTE(security): eval re-parses text that was built from remote page titles.
# This is only safe as long as dl.py escapes every character that is special
# inside double quotes (backslash, ", $ and backtick).
eval set -- "$files"
if [ "$#" -gt 1 ]
then
    # "title001.flv" -> "title_merged.flv"
    mergedfile=$(printf '%s\n' "$1" | sed 's/[0-9]*\.flv$/_merged.flv/')
    echo "starting merging files to file '$mergedfile'"
    # the names are already split into "$@": no second eval, so names with
    # spaces stay intact
    if ! ~/flvmerge "$mergedfile" "$@"
    then
        echo "merging files to file '$mergedfile' failed" >&2
        exit 1
    fi
    echo "--- completely merging files to file '$mergedfile'"
    echo
fi
我们保存为~/g(方法同上):
# 打开终端, 如已经打开可以忽略
vi ~/g # press ENTER
# press i
# 粘贴上面的bash代码
# press ESC
:wq # press ENTER
sudo chmod u+x ~/g # 使其可执行
这个程序尝试捕获~/dl.py的输出然后如果有必要(不止一个分段时)才将各个分段视频利用~/flvmerge合并成一个大FLV文件
好了, 一切准备就绪, 你可以在终端里输入~/g并回车执行, 可以看到命令使用帮助, 而且还给出了一个下载示例, 默认地下载到当前的文件夹, 即执行命令pwd的结果对应的目录
各个命令的使用方法如下
~/dl.py 在线视频地址 视频质量 # 下载指定地址指定质量的视频, 对于优酷可能会有很多分段
~/flvmerge 要合并成的文件 第一段flv视频 第二段flv视频 ... # 将后续的 "第一段flv视频" "第二段flv视频" ... 等FLV视频合并为 "要合并成的文件"
~/g 在线视频地址 视频质量 # 下载指定地址指定质量的视频, 如果有多个分段并且各个分段都是flv格式的, 那么将这些分段合并成去除后续的分段号并添加_merged.flv的文件名的视频 例如 "音乐银行001.flv" 将变成 "音乐银行_merged.flv", 不要害怕使用这个程序, 它仅仅会写进第一个参数即"要合并成的文件"而不会改变后面分段视频的, 所以你要是想看分段视频的你可以直接看分段视频的, 程序没有修改它们, 也没有删除它们, 所以即便合并出错(比如mp4格式的), 也不要怕, 可以看分段视频
** 关于视频质量:由flvcd的format参数决定, 对于优酷: normal标清, high高清, super超清, 其他的请自行实验, 另外如果选择优酷的高清模式下载的话, 那么这些文件将无法被合并, 因为优酷的高清视频是mp4格式的, 而mp4的合并尚待研究
可以这样试一试:
~/g http://v.youku.com/v_show/id_XMzMzMjE0MjE2.html super # 这将下载优酷超清模式的"音乐银行 111216"并且下载完后将保存为"音乐银行E633111216-_merged.flv", 文件位置在你执行上个命令的位置, 你可以紧接着ls *.flv查看下载的文件
读者可能不太了解使用命令下载的好处, 你将看到当~/g结合curl(Mac OS X)/wget(Ubuntu)+正则等抓取优酷专辑或优酷空间视频实行批量下载的威力, 如果网络带宽允许你可以尝试: curl -s 'http://music.youku.com/' | grep -oP 'http://v.youku.com/v_show/id_\w+.html' | xargs -L 1 ~/g # 这将下载优酷音乐频道页面的所有音乐视频并合并所有相同地址不同分段的视频
这个是我以前的博客地址: http://blog.csdn.net/Wind__Fantasy, 曾介绍过下载郑云视频全集的bash脚本(PS: 可惜我注册邮箱忘了, 密码也忘了, 所以这次必须得彻底转到博客园来了, csdn上的广告太多了, 实在受不了了。)