How to unzip / blow PDF stream

Work with 2016-W4 pdf, which has 2 large threads (pages 1 and 2), as well as many other objects and smaller threads. I am trying to blow off stream (s), work with the source data, but I'm afraid. I can only get corrupt data and incorrect checksum errors.

I wrote a test script to help debug, and pulled smaller streams from the stream files.

Here are 2 streams from the original pdf along with their length objects:

stream 1 :

149 0 obj
<< /Length 150 0 R /Filter /FlateDecode /Type /XObject /Subtype /Form /FormType
1 /BBox [0 0 8 8] /Resources 151 0 R >>
stream
x+TT(T0B ,JUWÈS0Ð37±402V(NFJSþ¶
«
endstream
endobj
150 0 obj
42
endobj

stream 2

142 0 obj
<< /Length 143 0 R /Filter /FlateDecode /Type /XObject /Subtype /Form /FormType
1 /BBox [0 0 0 0] /Resources 144 0 R >>
stream
x+Tçã
endstream
endobj
143 0 obj
11
endobj

I copied only the contents streamto the new files inside Vim (excluding carriage returns after streamand before endstream).

I tried both:

  • compress/flate( rfc-1951 ) - (delete the first 2 bytes ( CMF, FLG))
  • compress/zlib( rfc-1950 )

[]byte :

package main

import (
    "bytes"
    "compress/flate"
    "compress/gzip"
    "compress/zlib"
    "fmt"
    "io"
    "os"
)

var (
    flateReaderFn = func(r io.Reader) (io.ReadCloser, error) { return flate.NewReader(r), nil }
    zlibReaderFn  = func(r io.Reader) (io.ReadCloser, error) { return zlib.NewReader(r) }
)

func deflate(b []byte, skip, length int, newReader func(io.Reader) (io.ReadCloser, error)) {
    // rfc-1950
    // --------
    //   First 2 bytes
    //   [120, 1] - CMF, FLG
    //
    //   CMF: 120
    //     0111 1000
    //     ↑    ↑
    //     |    CM(8) = deflate compression method
    //     CINFO(7)   = 32k LZ77 window size
    //
    //   FLG: 1
    //     0001 ← FCHECK
    //            (CMF*256 + FLG) % 31 == 0
    //             120 * 256 + 1 = 30721
    //                             30721 % 31 == 0

    stream := bytes.NewReader(b[skip:length])
    r, err := newReader(stream)
    if err != nil {
        fmt.Println("\nfailed to create reader,", err)
        return
    }

    n, err := io.Copy(os.Stdout, r)
    if err != nil {
        if n > 0 {
            fmt.Print("\n")
        }
        fmt.Println("\nfailed to write contents from reader,", err)
        return
    }
    fmt.Printf("%d bytes written\n", n)
    r.Close()
}

func main() {
    //readerFn, skip := flateReaderFn, 2 // compress/flate RFC-1951, ignore first 2 bytes
    readerFn, skip := zlibReaderFn, 0 // compress/zlib RFC-1950, ignore nothing

    //                                                                                                ⤹ This is where the error occurs: `flate: corrupt input before offset 19`.
    stream1 := []byte{120, 1, 43, 84, 8, 84, 40, 84, 48, 0, 66, 11, 32, 44, 74, 85, 8, 87, 195, 136, 83, 48, 195, 144, 51, 55, 194, 177, 52, 48, 50, 86, 40, 78, 70, 194, 150, 74, 83, 8, 4, 0, 195, 190, 194, 182, 10, 194, 171, 10}
    stream2 := []byte{120, 1, 43, 84, 8, 4, 0, 1, 195, 167, 0, 195, 163, 10}

    fmt.Println("----------------------------------------\nStream 1:")
    deflate(stream1, skip, 42, readerFn) // flate: corrupt input before offset 19

    fmt.Println("----------------------------------------\nStream 2:")
    deflate(stream2, skip, 11, readerFn) // invalid checksum
}

, - , .

(PDF )

+4
2

/ . , , .

, "" PDF, , , PDF. (, hecate - ) , ( ).

# 1:

. , . , , , , /, "", . (, \n, \r\n). , , , , . , , , .

# 2:

flateReaderFn, ( ). , " ", , "" .

+1

, ...

, , Vim . , RFC Go compress/..., , .

, , PDF , stream/endstream deflate. , , .

+1 @icza, .

, , .

+1

Source: https://habr.com/ru/post/1670346/


All Articles