LoginSignup
0
0

More than 5 years have passed since last update.

zipフォーマットのパース

Last updated at Posted at 2013-08-29

前回作った処理をもうちょっと使えるようにしました。

utility

まずutility関数。split-groupのパラメータ名をちょっと変更しているので再掲です。

(defun read-binary-file (path)
  (with-open-file (in path :element-type '(unsigned-byte 8))
    (let ((bin (make-array (file-length in) :element-type '(unsigned-byte 8))))
      (read-sequence bin in)
      bin)))

(defun take-list (list num)
  (labels ((take-n (list num acc)
             (if (or (<= num 0)
                     (null list))
                 (values (nreverse acc) list)
                 (take-n (cdr list) (1- num) (cons (car list) acc)))))
    (take-n list num '())))

(defun split-group (list num-list &key (rest-p nil))
  (labels ((split-n (list num-list acc)
             (cond ((or (and (not rest-p)
                             (or (null list)
                                 (null num-list)))
                        (and rest-p
                             (null list)))
                    (nreverse acc))
                   ((and rest-p
                         (null num-list))
                    (nreverse (cons list acc)))
                   (t
                    (multiple-value-bind (taken-list rest-list)
                        (take-list list (car num-list))
                      (split-n rest-list (cdr num-list) (cons taken-list acc)))))))
    (split-n list num-list '())))

(defun little-endian-to-num (list)
  (loop for x in list
        for idx upfrom 0
        sum (ash x (* idx 8))))

クラス定義

解析中に何か情報を入れたくなるかもしれないので、基底クラスを定義しています。
また、前回作ったクラスもアクセサを追加してスロット名を一部変更しています。

(defclass <zip-header-base> ()
  ())

(defclass <zip-header> (<zip-header-base>)
  ((version-extract :accessor version-extract)
   (flag :accessor flag)
   (compression :accessor compression)
   (mod-time :accessor mod-time)
   (mod-date :accessor mod-date)
   (crc32 :accessor crc32)
   (compressed-size :accessor compressed-size)
   (uncompressed-size :accessor uncompressed-size)
   (filename-length :accessor filename-length)
   (extrafield-length :accessor extrafield-length)
   (filename :accessor filename)
   (extrafield :accessor extrafield)
   (data :accessor data)))

(defclass <zip-central-header> (<zip-header-base>)
  ((version-made-by :accessor version-made-by)
   (version-extract :accessor version-extract)
   (flag :accessor flag)
   (compression :accessor compression)
   (mod-time :accessor mod-time)
   (mod-date :accessor mod-date)
   (crc32 :accessor crc32)
   (compressed-size :accessor compressed-size)
   (uncompressed-size :accessor uncompressed-size)
   (filename-length :accessor filename-length)
   (extrafield-length :accessor extrafield-length)
   (filecomment-length :accessor filecomment-length)
   (disknumber-start :accessor disknumber-start)
   (internal-attributes :accessor internal-attributes)
   (external-attributes :accessor external-attributes)
   (offset :accessor offset)
   (filename :accessor filename)
   (extrafield :accessor extrafield)
   (filecomment :accessor filecomment)))

(defclass <zip-end-header> (<zip-header-base>)
  ((disk-number :accessor disk-number)
   (first-disk :accessor first-disk)
   (disk-entries :accessor disk-entries)
   (zip-entries :accessor zip-entries)
   (total-size :accessor total-size)
   (offset :accessor offset)
   (comment-size :accessor comment-size)
   (comment :accessor comment)))

データ読み込み

サイズ固定の項目を読み込んだあと、サイズ可変の項目を読み込みます。基本的にどのヘッダも同じ方法です。
クラスインスタンスと残りのデータを多値で返します。read-zip-end-headerは残りのデータが無い想定なので、nilを返します。

(defun read-zip-header (data)
  (let ((items (split-group data '(4 ; signature
                                   2 ; version needed to extract
                                   2 ; general purpose bit flag
                                   2 ; compression method
                                   2 ; last mod file time
                                   2 ; last mod file date
                                   4 ; crc-32
                                   4 ; compressed size
                                   4 ; uncompressed size
                                   2 ; file name length
                                   2 ; extra field length
                                   )
                            :rest-p t ; file name, extra field, data
                                   ))
        (inst (make-instance '<zip-header>)))
    (setf (slot-value inst 'version-extract) (little-endian-to-num (nth 1 items)))
    (setf (slot-value inst 'flag) (little-endian-to-num (nth 2 items)))
    (setf (slot-value inst 'compression) (little-endian-to-num (nth 3 items)))
    (setf (slot-value inst 'mod-time) (little-endian-to-num (nth 4 items)))
    (setf (slot-value inst 'mod-date) (little-endian-to-num (nth 5 items)))
    (setf (slot-value inst 'crc32) (little-endian-to-num (nth 6 items)))
    (setf (slot-value inst 'compressed-size) (little-endian-to-num (nth 7 items)))
    (setf (slot-value inst 'uncompressed-size) (little-endian-to-num (nth 8 items)))
    (setf (slot-value inst 'filename-length) (little-endian-to-num (nth 9 items)))
    (setf (slot-value inst 'extrafield-length) (little-endian-to-num (nth 10 items)))

    (let ((rest-items (split-group (nth 11 items) `(,(slot-value inst 'filename-length)
                                                     ,(slot-value inst 'extrafield-length)
                                                     ,(slot-value inst 'compressed-size))
                                   :rest-p t)))
      (setf (slot-value inst 'filename) (nth 0 rest-items))
      (setf (slot-value inst 'extrafield) (nth 1 rest-items))
      (setf (slot-value inst 'data) (nth 2 rest-items))

      (values inst (nth 3 rest-items)))))

(defun read-zip-central-header (data)
  (let ((items (split-group data '(4 ; signature
                                   2 ; version made by
                                   2 ; version needed to extract
                                   2 ; general purpose bit flag
                                   2 ; compression method
                                   2 ; last mod file time
                                   2 ; last mod file date
                                   4 ; crc-32
                                   4 ; compressed size
                                   4 ; uncompressed size
                                   2 ; file name length
                                   2 ; extra field length
                                   2 ; file comment length
                                   2 ; disk number start
                                   2 ; internal file attributes
                                   4 ; external file attributes
                                   4 ; relative offset of local header
                                   )
                            :rest-p t  ; file name, extra field, file comment
                            ))
        (inst (make-instance '<zip-central-header>)))
    (setf (slot-value inst 'version-made-by) (little-endian-to-num (nth 1 items)))
    (setf (slot-value inst 'version-extract) (little-endian-to-num (nth 2 items)))
    (setf (slot-value inst 'flag) (little-endian-to-num (nth 3 items)))
    (setf (slot-value inst 'compression) (little-endian-to-num (nth 4 items)))
    (setf (slot-value inst 'mod-time) (little-endian-to-num (nth 5 items)))
    (setf (slot-value inst 'mod-date) (little-endian-to-num (nth 6 items)))
    (setf (slot-value inst 'crc32) (little-endian-to-num (nth 7 items)))
    (setf (slot-value inst 'compressed-size) (little-endian-to-num (nth 8 items)))
    (setf (slot-value inst 'uncompressed-size) (little-endian-to-num (nth 9 items)))
    (setf (slot-value inst 'filename-length) (little-endian-to-num (nth 10 items)))
    (setf (slot-value inst 'extrafield-length) (little-endian-to-num (nth 11 items)))
    (setf (slot-value inst 'filecomment-length) (little-endian-to-num (nth 12 items)))
    (setf (slot-value inst 'disknumber-start) (little-endian-to-num (nth 13 items)))
    (setf (slot-value inst 'internal-attributes) (little-endian-to-num (nth 14 items)))
    (setf (slot-value inst 'external-attributes) (little-endian-to-num (nth 15 items)))
    (setf (slot-value inst 'offset) (little-endian-to-num (nth 16 items)))

    (let ((rest-items (split-group (nth 17 items) `(,(slot-value inst 'filename-length)
                                                     ,(slot-value inst 'extrafield-length)
                                                     ,(slot-value inst 'filecomment-length))
                                   :rest-p t)))
      (setf (slot-value inst 'filename) (nth 0 rest-items))
      (setf (slot-value inst 'extrafield) (nth 1 rest-items))
      (setf (slot-value inst 'filecomment) (nth 2 rest-items))

      (values inst (nth 3 rest-items)))))

(defun read-zip-end-header (data)
  (let ((items (split-group data '(4 ; signature
                                   2 ; number of this disk
                                   2 ; number of the disk with the start of the central directory
                                   2 ; number of the entries in the central directory on this disk
                                   2 ; total number of entries in the central directory
                                   4 ; size of the central directory
                                   4 ; offset of start of central directory with respect to the starting disk number
                                   2 ; zip file comment length
                                   )
                            :rest-p t  ; zip file comment
                            ))
        (inst (make-instance '<zip-end-header>)))
    (setf (slot-value inst 'disk-number) (little-endian-to-num (nth 1 items)))
    (setf (slot-value inst 'first-disk) (little-endian-to-num (nth 2 items)))
    (setf (slot-value inst 'disk-entries) (little-endian-to-num (nth 3 items)))
    (setf (slot-value inst 'zip-entries) (little-endian-to-num (nth 4 items)))
    (setf (slot-value inst 'total-size) (little-endian-to-num (nth 5 items)))
    (setf (slot-value inst 'offset) (little-endian-to-num (nth 6 items)))
    (setf (slot-value inst 'comment-size) (little-endian-to-num (nth 7 items)))
    
    (let ((rest-items (split-group (nth 8 items) `(,(slot-value inst 'comment-size)))))
      (setf (slot-value inst 'comment) (nth 1 rest-items))
      
      (values inst '()))))

Data Descriptorの存在

ローカルファイルヘッダの後ろに存在したりしなかったり、シグネチャもあったりなかったり、パーサ泣かせの存在です。
おそらくテープとかに圧縮しながら書き込むときに使っていた項目かなと思います。

seek-next-headerという関数を定義して、"PK"のシグネチャが始まるまで読み飛ばすようにします。

Data Descriptorのサイズは固定なので、シグネチャがあるData Descriptorの場合はシグネチャの判定後に読み飛ばします。シグネチャがないData Descriptorはそもそもseek-next-headerで読み飛ばし済みです。

(defun seek-next-header (data)
  (labels ((begin-seek (d)
             (cond ((null d)
                    data)
                   ((= (car d) #x50)
                    (in-p d))
                   (t
                    (begin-seek (cdr d)))))
           (in-p (d)
             (cond ((null d)
                    data)
                   ((= (cadr d) #x4B)
                    d)
                   (t
                    (begin-seek (cdr d))))))
    (begin-seek data)))

パース

ファイルを最後まで読み込みます。
まず、シグネチャを判定する関数を準備します。

(defun zip-header-p (data)
  (let ((signature (subseq data 0 4)))
    (equal signature '(#x50 #x4B #x03 #x04))))

(defun zip-central-header-p (data)
  (let ((signature (subseq data 0 4)))
    (equal signature '(#x50 #x4B #x01 #x02))))

(defun zip-end-header-p (data)
  (let ((signature (subseq data 0 4)))
    (equal signature '(#x50 #x4B #x05 #x06))))

(defun zip-data-descriptor-p (data)
  (let ((signature (subseq data 0 4)))
    (equal signature '(#x50 #x4B #x07 #x08))))

パース処理を書いていきます。
ファイルを読み込んでそれぞれのクラスを生成してリストで返す関数にします。

(defun parse-zip (data &key (header-fn nil) (central-header-fn nil) (end-header-fn nil))
  (labels ((parse-main (data inst-list)
             (cond 
                   ;; パース終了
                   ((null data)
                    (nreverse inst-list))

                   ;; ローカルヘッダ
                   ((zip-header-p data)
                    (multiple-value-bind (inst rest)
                        (read-zip-header data)
                      (when header-fn
                        (funcall header-fn inst))
                      (parse-main (seek-next-header rest) (cons inst inst-list))))

                   ;; Data Descriptor(読み飛ばし)
                   ((zip-data-descriptor-p data)
                    (parse-main (seek-next-header (subseq data 16)) inst-list))

                   ;; ファイルヘッダ
                   ((zip-central-header-p data)
                    (multiple-value-bind (inst rest)
                        (read-zip-central-header data)
                      (when central-header-fn
                        (funcall central-header-fn inst))
                      (parse-main (seek-next-header rest) (cons inst inst-list))))

                   ;; エンドヘッダ
                   ((zip-end-header-p data)
                    (multiple-value-bind (inst rest)
                        (read-zip-end-header data)
                      (declare (ignore rest))
                      (when end-header-fn
                        (funcall end-header-fn inst))
                      (parse-main nil (cons inst inst-list))))

                   ;; 不明
                   (t
                    (error (format nil "unknown header:~{ ~A~}"
                                   (subseq data 0 (min 4 (length data)))))))))
    (parse-main data '())))

フック処理を用意しているので、簡単な解析とかはこれで対処できます。
以下はzipに格納されているファイルのうち、暗号化されていないファイル名を得る処理です。

CL-USER> (let ((ng-files))
           (parse-zip (coerce (read-binary-file #p"/home/shingo/test.zip") 'list)
                      :header-fn
                      #'(lambda (inst)
                          (when (not (= #x01
                                        (logand #x01 (flag inst))))
                            (push (format nil "~{~C~}" (mapcar #'code-char (filename inst))) ng-files))))
           (nreverse ng-files))

("test.txt")
0
0
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
0
0