From the `file` man page:
The type printed will usually contain one of the words text (the file contains only printing characters and a few common control characters and is probably safe to read on an ASCII terminal), executable (the file contains the result of compiling a program in a form understandable to some UNIX kernel or another), or data meaning anything else (data is usually “binary” or non-printable).
Seeing as you just want to determine whether it is text or binary, I would simply check whether every character in the stream is printable:
import string
# True only when no element of `stream` falls outside string.printable.
# NOTE(review): `stream` is not defined in this snippet; presumably an
# iterable of single characters -- confirm against the caller.
not any(c not in string.printable for c in stream)
This will never be 100% accurate, but it should be reasonably close. Do you need to handle unicode?
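EDIT: Unicode support is a little trickier, but if you have a set of candidate encodings, you can try decoding the data with each one in turn and then check that every decoded character is printable: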
import string
import unicodedata
# Candidate encodings, tried in this order by attempt_decode().
encodings = 'ascii', 'utf-8', 'utf-16'
# Raw byte samples: binary junk, plain ASCII, and UTF-8 with a non-ASCII
# character. The explicit b prefix is a no-op on Python 2.6+ and keeps the
# samples as byte strings (which have .decode) on Python 3 as well.
test_strings = (
    b'\xf0\x01\x01\x00\x44',
    b'this is a test',
    b'a utf-8 test \xe2\x98\x83',
)
def attempt_decode(s, encodings):
    """Decode *s* with the first encoding in *encodings* that accepts it.

    Args:
        s: Byte string to decode.
        encodings: Iterable of codec names, tried in the given order.

    Returns:
        A ``(decoded_text, encoding_name)`` pair for the first codec that
        decodes *s* without error, or ``(s, 'binary')`` with *s* unchanged
        when every codec fails (including when *encodings* is empty).

    Only ``UnicodeDecodeError`` is treated as "try the next codec"; any
    other error (for example an unknown codec name) propagates.
    """
    for enc in encodings:
        try:
            return s.decode(enc), enc
        except UnicodeDecodeError:
            # Not valid in this encoding; fall through to the next one.
            continue
    return s, 'binary'
def printable(s):
    """Return True when *s* contains only printable characters.

    Args:
        s: Either a byte string or decoded (unicode) text.

    Returns:
        For byte strings: True when every byte is one of the characters in
        ``string.printable``.
        For text: True when no character is a control character (Unicode
        category ``Cc``) other than the whitespace controls that
        ``string.printable`` itself accepts (tab, newline, carriage return,
        vertical tab, form feed).
        Empty input yields True in both cases.
    """
    if isinstance(s, (bytes, bytearray)):
        # bytearray yields integers on both Python 2 and 3, so the byte
        # branch behaves the same regardless of interpreter version.
        allowed = frozenset(bytearray(string.printable.encode('ascii')))
        return all(byte in allowed for byte in bytearray(s))

    # Without this exemption ordinary multi-line text would be rejected,
    # because '\n', '\t' and '\r' are themselves category 'Cc'.
    whitespace_controls = frozenset(u'\t\n\r\x0b\x0c')
    return not any(
        unicodedata.category(c) == 'Cc' and c not in whitespace_controls
        for c in s
    )
# Classify each sample: keep the decoded text only when it decoded cleanly
# AND contains nothing unprintable; otherwise report the raw input as binary.
for s in test_strings:
    result, enc = attempt_decode(s, encodings)
    if enc != 'binary' and not printable(result):
        # Decoded without error but holds unprintable characters.
        result, enc = s, 'binary'
    # The parenthesised single-argument form prints the same on Python 2 and 3.
    print(enc + ' - ' + repr(result))
Output:
binary - '\xf0\x01\x01\x00D'
ascii - u'this is a test'
utf-8 - u'a utf-8 test \u2603'