ファイル種別の判定方法
拡張子だけでファイル種別を判別できるのであれば以下でOK
>>> from mimetypes import guess_type
>>> guess_type("test.docx")
('application/vnd.openxmlformats-officedocument.wordprocessingml.document',
None)
しかし拡張子が信用できない場合や、拡張子のないファイルの判別はできない。
ファイルの中身で判定する必要がある。
PyPIにそれらしいライブラリはあったのだが、自分の欲しい種別が判定できなかったり、動作が遅かったりしたため、自作することにした。(自分は、拡張子なしファイルの中身が MS Office、CSV、音声、動画、画像、圧縮ファイル、その他テキスト、ショートカット、未知のバイナリのいずれなのかをサクッと区別したい)
中身を評価したファイル種別判定
https://en.m.wikipedia.org/wiki/List_of_file_signatures
を参考にファイルヘッダをある程度調べることができる
ソース
関数「guesstype」にターゲットファイルパスを渡すとファイル種別を戻すというもの。
以下は、スクリプト実行できるようにしてみた。
一応動くしそれなりに速いがコードが散らかってる。。
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import re
from io import BytesIO, StringIO
import csv
from chardet import detect
def is_tar(b: bytes):
    """Report whether *b* carries the ``ustar`` tar magic.

    Only the five magic bytes at offset 257 are compared; the bytes that
    follow the magic are not inspected.
    """
    magic = b"ustar"
    offset = 257
    return b[offset:offset + len(magic)] == magic
def is_lha(b: bytes):
    """Report whether *b* starts like an LHA/LZH archive header.

    The check is: byte 0 is ``!``, bytes 2-4 are ``-lh`` and byte 6 is
    ``-``.

    Slices are used instead of single-index lookups so that empty or
    truncated input yields ``False`` rather than raising ``IndexError``.
    """
    return b[:1] == b"!" and b[2:5] == b"-lh" and b[6:7] == b"-"
def is_xls(b: bytes):
    """Report whether *b* looks like a spreadsheet document.

    Three containers are recognised:

    * an OLE2 compound file whose probed name is ``Workbook`` or starts
      with ``Book``;
    * a ZIP whose first member is ``[Content_Types].xml`` and which
      mentions an ``xl/`` entry;
    * a ZIP whose first member is the OpenDocument spreadsheet mimetype.
    """
    if b.startswith(b'\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1'):
        # Offset derived from header bytes 30-31 and 48-49 (presumably the
        # sector shift and first directory sector -- confirm against the
        # compound-file spec); 640 is added as a fixed displacement.
        exponent = sum(b[30:32])
        multiplier = sum(b[48:50])
        entry = 2 ** exponent * multiplier + 640
        # Keep every second byte of the 16-byte window (presumably to
        # collapse a UTF-16LE name to ASCII).
        stream = b[entry:entry + 16][::2]
        return stream == b"Workbook" or stream[:4] == b"Book"
    if not b.startswith(b"PK"):
        return False
    # Offset 30 is where the first member's file name is compared.
    if b.startswith(b"[Content_Types].xml", 30):
        return b"\x00xl/" in b
    return b.startswith(
        b"mimetypeapplication/vnd.oasis.opendocument.spreadsheet", 30)
def is_doc(b: bytes):
    """Report whether *b* looks like a word-processing document.

    Recognised containers: an OLE2 compound file with ``EC A5`` at offset
    512, a ZIP whose first member is ``[Content_Types].xml`` and which
    mentions a ``word/`` entry, or a ZIP whose first member is the
    OpenDocument text mimetype.
    """
    is_ole = b.startswith(b'\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1')
    if is_ole:
        # Two marker bytes right after the first 512 bytes.
        return b.startswith(b"\xec\xa5", 512)

    is_zip = b.startswith(b"PK")
    if is_zip and b.startswith(b"[Content_Types].xml", 30):
        return b"\x00word/" in b
    if is_zip:
        return b.startswith(
            b"mimetypeapplication/vnd.oasis.opendocument.text", 30)
    return False
def is_ppt(b: bytes):
    """Report whether *b* looks like a presentation document.

    For OLE2 compound files the decision is by elimination: the data is a
    presentation when it is neither the word-processor nor the
    spreadsheet flavour and the probed name window is non-empty.  For ZIP
    containers the first member name and a ``ppt/`` entry (or the
    OpenDocument presentation mimetype) decide.
    """
    if b.startswith(b'\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1'):
        # Word-processor marker at offset 512 rules a presentation out.
        if b.startswith(b"\xec\xa5", 512):
            return False
        # Same offset arithmetic as the spreadsheet probe: bytes 30-31 and
        # 48-49 of the header, plus a fixed displacement of 640.
        exponent = sum(b[30:32])
        multiplier = sum(b[48:50])
        entry = 2 ** exponent * multiplier + 640
        stream = b[entry:entry + 16][::2]
        if stream == b"Workbook" or stream[:4] == b"Book":
            return False
        # Anything else counts, provided the window was not empty.
        return len(stream) > 0

    if b.startswith(b"PK"):
        first_member_ok = (b.startswith(b"[Content_Types].xml", 30)
                           or b.startswith(b"ppt/", 30))
        if first_member_ok:
            return b"\x00ppt/" in b
        return b.startswith(
            b"mimetypeapplication/vnd.oasis.opendocument.presentation", 30)
    return False
def is_text(b: bytes):
    """Treat *b* as text when it contains no NUL byte."""
    return b.find(b"\x00") == -1
def is_bin(b: bytes):
    """Treat *b* as binary when at least one NUL byte is present."""
    return b.find(b"\x00") != -1
def is_xml(b: bytes):
    """Report whether *b* looks like an XML document.

    The data must be NUL-free, begin (after any leading UTF-8 BOM bytes)
    with ``<?xml version`` and, ignoring trailing whitespace, end with
    ``>``.
    """
    # NUL-free is this module's definition of "text".
    if b"\x00" in b:
        return False
    prolog = b.lstrip(b"\xef\xbb\xbf")[:13]
    if prolog != b"<?xml version":
        return False
    # 62 is the byte value of ">".
    return b.rstrip()[-1] == 62
def is_html(b: bytes):
    """Report whether *b* looks like an HTML document.

    The data must be NUL-free, start with ``<`` and end with ``>`` (a
    leading UTF-8 BOM and surrounding whitespace are ignored) and contain
    either an ``<html`` tag or an ``<!doctype html`` declaration, in any
    letter case.

    This is a plausibility check only; it does not validate the markup.
    """
    # NUL-free is this module's definition of "text".
    if b"\x00" in b:
        return False
    body = b[3:] if b.startswith(b"\xef\xbb\xbf") else b
    body = body.strip()
    # startswith/endswith compare bytes with bytes and are safe on empty
    # input; indexing a bytes object yields an int, which never equals b"<".
    if not (body.startswith(b"<") and body.endswith(b">")):
        return False
    # Tag and doctype names are case-insensitive in HTML.
    lowered = body.lower()
    return b"<html" in lowered or b"<!doctype html" in lowered
def is_json(b: bytes):
    """Report whether *b* looks like a JSON object.

    The data must be NUL-free, start with ``{`` and end with ``}`` (a
    leading UTF-8 BOM and surrounding whitespace are ignored) and contain
    at least one ``:``.

    This is a plausibility check only; parse the data to validate it.
    """
    # NUL-free is this module's definition of "text".
    if b"\x00" in b:
        return False
    body = b[3:] if b.startswith(b"\xef\xbb\xbf") else b
    body = body.strip()
    # startswith/endswith compare bytes with bytes and are safe on empty
    # input; indexing a bytes object yields an int, which never equals b"{".
    return body.startswith(b"{") and body.endswith(b"}") and b":" in body
sniffer = csv.Sniffer()
# Delimiters the sniffer should favour; is_csv() also uses this list as
# the set of delimiters it accepts.
sniffer.preferred = [',', '\t', ';', ' ', ':', '|']


def is_csv(b: bytes):
    """Report whether *b* looks like delimiter-separated text.

    The encoding is guessed with chardet, the bytes are decoded, and
    ``csv.Sniffer`` is asked for a dialect.  The data counts as CSV when a
    dialect is found and its delimiter is one of ``sniffer.preferred``.

    This is a plausibility check only; it does not parse the data.
    """
    encoding = detect(b)["encoding"] or "utf-8"
    try:
        # errors="replace": the sample may be cut in the middle of a
        # multi-byte character, or the encoding guess may be wrong.
        # Neither should abort detection.
        text = b.decode(encoding, errors="replace")
    except LookupError:
        # The guessed codec name is not known to this interpreter.
        text = b.decode("utf-8", errors="replace")
    try:
        dialect = sniffer.sniff(text)
    except csv.Error:
        # The sniffer could not determine a delimiter.
        return False
    return dialect.delimiter in sniffer.preferred
""" referenced by
https://en.m.wikipedia.org/wiki/List_of_file_signatures
"""
# Regex-based signatures, keyed by the first two bytes of the data.
# Each value is a list of [matcher, label] pairs, where matcher is the
# bound ``match`` method of a compiled bytes pattern (anchored at offset 0).
#
# re.DOTALL is required: the ``.`` wildcards stand for arbitrary bytes
# (the four-byte field between the container tag and the form type), and
# without DOTALL a 0x0A byte in that field would make the pattern fail.
match = {  # bytes regex match define
    b'FO': [
        [re.compile(b'FORM....AIFF', re.DOTALL).match, 'aiff'],
        [re.compile(b'FORM....ANBM', re.DOTALL).match, 'anbm'],
        [re.compile(b'FORM....ANIM', re.DOTALL).match, 'anim'],
        [re.compile(b'FORM....CMUS', re.DOTALL).match, 'cmus'],
        [re.compile(b'FORM....FANT', re.DOTALL).match, 'fant'],
        [re.compile(b'FORM....FAXX', re.DOTALL).match, 'faxx'],
        [re.compile(b'FORM....FTXT', re.DOTALL).match, 'ftxt'],
        [re.compile(b'FORM....ILBM', re.DOTALL).match, 'ilbm'],
        [re.compile(b'FORM....SMUS', re.DOTALL).match, 'smus'],
        [re.compile(b'FORM....YUVN', re.DOTALL).match, 'yuvn']],
    b'RI': [
        [re.compile(b'RIFF....AVI ', re.DOTALL).match, 'avi'],
        [re.compile(b'RIFF....WAVE', re.DOTALL).match, 'wav'],
        [re.compile(b'RIFF....WEBP', re.DOTALL).match, 'webp']],
    b'\xff\xd8': [
        # Four alternative JPEG headers sharing the FF D8 FF prefix.  They
        # must be alternatives: concatenated into one pattern they can
        # never match a real file.
        [re.compile(
            b'\xff\xd8\xff(?:'
            b'\xdb'
            b'|\xe0\x00\x10JFIF\x00\x01'
            b'|\xee'
            b'|\xe1..Exif\x00\x00'
            b')', re.DOTALL).match, 'jpg']],
}
# Prefix signatures, keyed by the first two bytes of the data.
# Each value is a list of [signature, label] pairs; the first signature
# that the data starts with wins, so within one list a longer signature
# must be listed BEFORE any shorter signature that is its prefix.
#
# Signatures that have several alternative byte sequences are listed as
# separate entries (or separate keys): a concatenation of the alternatives
# can never match a real file.
start = {  # bytes startswith define
    b'\x00\x00': [
        [b'\x00\x00\x01\x00', 'icon'],
        [b'\x00\x00\x01\xba', 'mpg']],
    b'\x00\x01': [
        [b'\x00\x01\x00\x00Standard ACE DB\x00', 'accdb'],
        [b'\x00\x01\x00\x00Standard Jet DB\x00', 'mdb'],
        [b'\x00\x01\x00\x00', 'palmdata'],
        [b'\x00\x01BD', 'palmarchivedata'],
        [b'\x00\x01DT', 'palmcalenderdata']],
    b'\x00a': [[b'\x00asm', 'asm']],
    b'\x04"': [[b'\x04"M\x18', 'lz4']],
    b'\x05\x07': [[b'\x05\x07\x00\x00BOBO\x05\x07\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01', 'cwk']],
    b'\x06\x07': [[b'\x06\x07\xe1\x00BOBO\x06\x07\xe1\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01', 'cwk']],
    b'\n\r': [[b'\n\r\r\n', 'pcapng']],
    b'\x1aE': [[b'\x1aE\xdf\xa3', 'mkv']],
    b'\x1bL': [[b'\x1bLua', 'luac']],
    b'\x1f\x8b': [[b'\x1f\x8b', 'gz']],
    b'\x1f\x9d': [[b'\x1f\x9d', 'Z']],
    b'\x1f\xa0': [[b'\x1f\xa0', 'Z']],
    b' \x02': [[b' \x02\x01b\xa0\x1e\xab\x07\x02\x00\x00\x00', 'tde']],
    b'!<': [[b'!<arch>', 'linux deb file']],
    b'$S': [[b'$SDI0001', 'System Deployment Image']],
    b'%!': [[b'%!PS', 'ps']],
    b'%P': [[b'%PDF-', 'pdf']],
    b"'\x05": [[b"'\x05\x19V", 'U-Boot / uImage. Das U-Boot Universal Boot Loader.']],
    # 28 B5 2F FD is the Zstandard frame magic, not a compress(1) ".Z" file.
    b'(\xb5': [[b'(\xb5/\xfd', 'zst']],
    b'0&': [[b'0&\xb2u\x8ef\xcf\x11\xa6\xd9\x00\xaa\x00b\xcel', 'asf']],
    b'0\x82': [[b'0\x82', 'der']],
    b'7H': [[b'7H\x03\x02\x00\x00\x00\x00X509KEY', 'kdb']],
    b'7z': [[b"7z\xbc\xaf'\x1c", '7z']],
    b'8B': [[b'8BPS', 'psd']],
    b':)': [[b':)\n', 'Smile file']],
    b'AG': [[b'AGD3', 'fh8']],
    b'BA': [[b'BACKMIKEDISK', 'File or tape containing a backup done with AmiBack on an Amiga. It typically is paired with an index file ']],
    b'BM': [[b'BM', 'bmp']],
    b'BP': [[b'BPG\xfb', 'Better Portable Graphics format']],
    b'BZ': [[b'BZh', 'bz2']],
    # SWF has two alternative magics: CWS and FWS.
    b'CW': [[b'CWS', 'swf']],
    b'FW': [[b'FWS', 'swf']],
    b'Cr': [[b'Cr24', 'Google Chrome extension']],
    b'DC': [[b'DCM\x01PA30', 'Windows Update Binary Delta Compression']],
    b'EM': [[b'EMU3', 'Emulator III synth samples'], [b'EMX2', 'Emulator Emaxsynth samples']],
    # Roxio Toast images start with "ER\x02\x00\x00\x00", optionally
    # preceded by an 0x8B byte.
    b'ER': [[b'ER\x02\x00\x00\x00', 'Roxio Toast disc image file']],
    b'\x8bE': [[b'\x8bER\x02\x00\x00\x00', 'Roxio Toast disc image file']],
    b'FL': [[b'FLIF', 'Free Lossless Image Format']],
    b'GI': [[b'GIF87a', 'gif'],
            [b'GIF89a', 'gif']],
    b'ID': [[b'ID3', 'mp3']],
    # The Canon RAW signature extends the TIFF one, so it must come first.
    b'II': [[b'II*\x00\x10\x00\x00\x00CR', 'Canon RAW Format Version 2'],
            [b'II*\x00', 'tiff']],
    b'IN': [[b'INDX', 'Index file to a file or tape containing a backup done with AmiBack on an Amiga.']],
    b'L\x00': [[b'L\x00\x00\x00', 'lnk']],
    b'KD': [[b'KDM', 'vmdk']],
    b'LZ': [[b'LZIP', 'lzip']],
    b'MI': [[b'MIL ', '"SEAN\xa0: Session Analysis" Training file. Also used in compatible software "Rpw\xa0: Rowperfect for Windows" and "RP3W\xa0: ROWPERFECT3 for Windows".']],
    b'ML': [[b'MLVI', 'Magic Lantern Video file']],
    b'MS': [[b'MSCF', 'cab']],
    b'MT': [[b'MThd', 'midi']],
    b'MZ': [[b'MZ', 'exe']],
    b'NE': [[b'NES\x1a', 'Nintendo Entertainment System ROM file']],
    b'OR': [[b'ORC', 'Apache ORC ']],
    b'Ob': [[b'Obj\x01', 'Apache Avro binary file format']],
    b'Og': [[b'OggS', 'Ogg']],
    b'PA': [[b'PAR1', 'Apache Parquet columnar file format']],
    b'PK': [[b'PK\x03\x04', 'zip'], [b'PK\x05\x06', 'zip empty archive'],
            [b'PK\x07\x08', 'zip spanned archive']],
    b'PM': [[b'PMOCCMOC', 'Windows Files And Settings Transfer Repository']],
    b'RN': [[b'RNC\x01', 'Compressed file using Rob Northen Compression '],
            [b'RNC\x02', 'Compressed file using Rob Northen Compression ']],
    b'Ra': [[b'Rar!\x1a\x07\x00', 'rar'],
            [b'Rar!\x1a\x07\x01\x00', 'rar']],
    b'Re': [[b'Received', 'Email Message var5']],
    b'SE': [[b'SEQ6', 'RCFile columnar file format']],
    b'SI': [[b'SIMPLE = T', 'Flexible Image Transport System ']],
    b'SP': [[b'SP01', 'Amazon Kindle Update Package ']],
    b'SQ': [[b'SQLite format 3\x00', 'sqlite3']],
    b'SZ': [[b"SZDD\x88\xf0'3", 'Microsoft compressed file in Quantum format']],
    b'TA': [[b'TAPE', 'Microsoft Tape Format']],
    b'TD': [[b'TDEF', 'Telegram Desktop Encrypted File'],
            [b'TDF$', 'Telegram Desktop File']],
    b'UU': [[b'UU\xaa\xaa', 'PhotoCap Vector']],
    b'XP': [[b'XPDS', 'SMPTE DPX image']],
    b'[Z': [[b'[ZoneTransfer]', 'Microsoft Zone Identifier for URL Security Zones']],
    b'bo': [[b'book\x00\x00\x00\x00mark\x00\x00\x00\x00', 'macOS file Alias']],
    b'bv': [[b'bvx2', 'LZFSE - Lempel-Ziv style data compression algorithm using Finite State Entropy coding. OSS by Apple.']],
    b'de': [[b'dex\n035\x00', 'Dalvik Executable']],
    b'e\x87': [[b'e\x87xV', 'PhotoCap Object Templates']],
    b'fL': [[b'fLaC', 'Free Lossless Audio Codec']],
    b'\x00m': [[b'\x00mlocate', "locate"]],
    b'to': [[b'tox3', 'Open source portable voxel file']],
    b'v/': [[b'v/1\x01', 'OpenEXR image']],
    b'wO': [[b'wOF2', 'WOFF File Format 2.0'],
            [b'wOFF', 'WOFF File Format 1.0']],
    # zlib streams start with 78 01 / 78 9C / 78 DA depending on the
    # compression level; the longer dmg signature shares the 78 01 prefix
    # and is therefore tested first.
    b'x\x01': [[b'x\x01s\rbb`', 'dmg'],
               [b'x\x01', 'zlib']],
    b'x\x9c': [[b'x\x9c', 'zlib']],
    b'x\xda': [[b'x\xda', 'zlib']],
    b'xV': [[b'xV4', 'PhotoCap Template']],
    b'xa': [[b'xar!', 'eXtensible ARchive format']],
    b'{\\': [[b'{\\rtf1', 'rtf']],
    b'\x7fE': [[b'\x7fELF', 'Executable and Linkable Format']],
    b'\x80*': [[b'\x80*_\xd7', 'Kodak Cineon image']],
    b'\x80\x02': [[b'\x80\x02', 'pickle']],
    b'\x80\x03': [[b'\x80\x03', 'pickle']],
    b'\x80\x04': [[b'\x80\x04', 'pickle']],
    b'\x80\x05': [[b'\x80\x05', 'pickle']],
    b'\x89P': [[b'\x89PNG\r\n\x1a\n', 'Image encoded in the Portable Network Graphics format']],
    b'\x89H': [[b'\x89HDF\r\n\x1a\n', 'hdf5']],
    b'\x96\xd5': [[b'\x96\xd5u!', 'sarbin']],
    # The byte-swapped variant D4 C3 B2 A1 has its own key below.
    b'\xa1\xb2': [[b'\xa1\xb2\xc3\xd4', 'Libpcap File Format']],
    b'\xbe\xba': [[b'\xbe\xba\xfe\xca', 'palmcalenderdata']],
    b'\xca\xfe': [[b'\xca\xfe\xba\xbe', 'javaclass']],
    b'\xce\xfa': [[b'\xce\xfa\xed\xfe', 'Mach-O binary ']],
    b'\xcf\x84': [[b'\xcf\x84\x01', 'jpg']],
    b'\xcf\xfa': [[b'\xcf\xfa\xed\xfe', 'Mach-O binary ']],
    b'\xd0\xcf': [[b'\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1', 'Microsoft Office 2003older']],
    b'\xd4\xc3': [[b'\xd4\xc3\xb2\xa1', 'pcap']],
    b'\xed\xab': [[b'\xed\xab\xee\xdb', 'rpm ']],
    b'\xef\xbb': [[b'\xef\xbb\xbf', 'UTF-8 encoded Unicode byte order mark']],
    b'\xfd7': [[b'\xfd7zXZ\x00\x00', 'xz']],
    b'\xfe\xed': [[b'\xfe\xed\xfe\xed', 'JKS JavakeyStore']],
    b'\xff\xfb': [[b'\xff\xfb', 'mp3']],
    # The 32-bit BOM extends the 16-bit one, so it must come first.
    b'\xff\xfe': [[b'\xff\xfe\x00\x00', 'Byte-order mark for text file encoded in little-endian 32-bit Unicode Transfer Format'],
                  [b'\xff\xfe', 'Byte-order mark for text file encoded in little-endian 16-bit Unicode Transfer Format']]
}
def lookuptype(x: bytes):
    """Map a content sample to a type label.

    NUL-free samples are classified as ``xml``, ``html``, ``json``,
    ``csv`` or, failing all of those, ``txt``.  Samples containing NUL are
    checked against the office, tar and lha probes and then against the
    ``start`` / ``match`` signature tables.

    Returns the label, or ``None`` when a binary sample matches nothing.
    """
    if not is_bin(x):
        text_probes = (
            (is_xml, "xml"),
            (is_html, "html"),
            (is_json, "json"),
            (is_csv, "csv"),
        )
        for probe, label in text_probes:
            if probe(x):
                return label
        return "txt"

    prefix = x[:2]
    if prefix == b"PK" or prefix == b"\xd0\xcf":
        # ZIP-based office formats get the trailing "x" (docx/xlsx/pptx);
        # the OLE2 container yields the plain names (doc/xls/ppt).
        suffix = "x" if prefix == b"PK" else ""
        for probe, label in ((is_doc, "doc"), (is_xls, "xls"), (is_ppt, "ppt")):
            if probe(x):
                return label + suffix
        # Not an office document: fall through to the signature tables.
    elif is_tar(x):
        return "tar"
    elif is_lha(x):
        return 'lha'

    # The prefix table takes precedence; the regex table is consulted only
    # when the two-byte prefix is absent from ``start``.
    if prefix in start:
        return next(
            (label for sig, label in start[prefix] if x.startswith(sig)),
            None)
    if prefix in match:
        return next(
            (label for matcher, label in match[prefix] if matcher(x)),
            None)
    return None
def headtail(fp, buf):
    """Return the first *buf* and the last *buf* items of a stream, joined.

    Reading starts at the stream's current position.  When the stream
    refuses to seek relative to its end (``OSError``), the remainder is
    read sequentially and its last *buf* items are used instead.
    """
    sample = fp.read(buf)
    try:
        # Jump to buf items before the end and take everything from there.
        fp.seek(-buf, os.SEEK_END)
        tail = fp.read()
    except OSError:
        tail = fp.read()[-buf:]
    return sample + tail
def guesstype(f):
    """Guess a file's type from its content.

    *f* may be a path (``str`` or an object with ``joinpath``), raw
    ``bytes`` / ``bytearray``, a ``BytesIO`` / ``StringIO``, a tar or zip
    member stream (``ExFileObject`` / ``ZipExtFile``), or another file
    object exposing ``mode`` (or ``_mode``).  Only the first and last 516
    bytes are examined.  For objects with ``seek`` the position found on
    entry is restored before returning.

    Returns ``"ZERO"`` when no data could be read, otherwise whatever
    ``lookuptype`` reports (a label, or ``None`` for unknown binary
    data).  For a generic OLE2 container the file-name extension is
    returned when a name is available, else the generic label.

    Raises ``AttributeError`` for an object that has neither ``mode`` nor
    ``_mode`` and is none of the recognised kinds.
    """
    buf = 516
    seekable = hasattr(f, "seek")
    if seekable:
        pos = f.tell()
    try:
        klass = f.__class__.__name__
        if isinstance(f, str) or hasattr(f, "joinpath"):
            with open(f, "rb") as fp:
                ret = headtail(fp, buf)
        elif isinstance(f, BytesIO) or klass in ["ExFileObject", "ZipExtFile"]:
            ret = headtail(f, buf)
        elif isinstance(f, (bytearray, bytes)):
            ret = f[:buf] + f[-1 * buf:]
        elif isinstance(f, StringIO):
            e = f.encoding
            f.seek(0)
            ret = headtail(f, buf)
            ret = ret.encode(e) if e else ret.encode()
        else:
            try:
                m = f.mode
            except AttributeError:
                m = f._mode
            if isinstance(m, int) or "b" in m:
                ret = headtail(f, buf)
            else:
                # Text-mode handle: re-open the file by name for reading
                # bytes.  Always "rb": deriving the mode from the handle
                # would truncate a file opened with "w" and is invalid
                # for modes such as "rt".
                with open(f.name, "rb") as fp:
                    ret = headtail(fp, buf)
    finally:
        # Hand the stream back where the caller left it, even on error.
        if seekable:
            f.seek(pos)
    if not ret:
        return "ZERO"
    _type = lookuptype(ret)
    if _type == "Microsoft Office 2003older":
        # Content alone did not pin down doc/xls/ppt; fall back to the
        # extension when a file name is available.
        source = f if isinstance(f, str) else getattr(f, "name", None)
        if isinstance(source, (str, bytes)):
            return os.path.splitext(source)[1][1:]
        # Raw bytes or unnamed streams carry no name to inspect.
        return _type
    return _type
def main():
    """Command-line entry point: print the guessed type of each file.

    Every positional argument is expanded with ``glob``; each resulting
    path is normalised, optionally announced on stderr (``-v``), and its
    guessed type is printed on stdout.

    Raises ``FileNotFoundError`` when the arguments matched no path.
    """
    import sys
    from glob import glob
    from argparse import ArgumentParser

    parser = ArgumentParser(prog="guesstype",
                            description="guess filetype program\n")
    parser.add_argument("-v", "--verbose", help="print progress",
                        action='store_true', default=False)
    parser.add_argument("files",
                        metavar="<files>",
                        nargs="+", default=[],
                        help="text dump any files")
    options = parser.parse_args()

    matched = False
    for pattern in options.files:
        for hit in glob(pattern):
            path = os.path.normpath(hit)
            if options.verbose:
                sys.stderr.write("Varidating:{}\n".format(path))
                sys.stderr.flush()
            print(guesstype(path))
            matched = True

    if not matched:
        raise FileNotFoundError(str(options.files))


# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()
- csv,xml,html,jsonの判定はあくまでぽいものの判定であって、真に妥当かどうかはライブラリーでパースすべし。
- wikipediaのファイルヘッダでは最大262バイトを先頭から読めば大抵のファイルは特定できる模様
- MS Accessの実装を忘れてたがそんなもの使ってないか、、
- OpenOfficeのファイルもあったので判定方法後日考えるか、、