warc.open()
warc.WARCFile()
, warc.WARCFile()
fileobj
, sys.stdin
. - :
import sys
import warc
# Hand warc.open() an already-open file object (standard input) via the
# ``fileobj`` keyword instead of a filename.
f = warc.open(fileobj=sys.stdin)
for record in f:
    # Print the 'WARC-Target-URI' and 'Content-Length' headers of each
    # record, separated by a single space. A single pre-formatted argument
    # keeps the output identical under the Python 2 print statement and the
    # Python 3 print function.
    print('%s %s' % (record['WARC-Target-URI'], record['Content-Length']))
hadoop , - .gz
, hadoop \r\n
WARC \n
, WARC (. : hadoop \r\n \n ARC). warc
"WARC/(\d+.\d+)\r\n"
( \r\n
), , , :
IOError: Bad version line: 'WARC/1.0\n'
, PipeMapper.java
, , , WARC.
, warc.py
\n
\r\n
, Content-Length
. , hadoop, Content-Length
, , :
IOError: Expected '\n', found 'abc\n'