Removing a character sequence from a large binary using python

Question

Removing a character sequence from a large binary using python

I would like to trim long sequences of the same value from a binary file in Python. A simple way to do it is to read the whole file into memory and use re.sub to replace the unwanted sequence. Of course, this will not work on large binary files. Can it be done with something like numpy?

+3

python numpy binaryfiles

bluegray Oct 21 '08 at 10:21

source share

6 answers

, . - . , numpy, array. , .

, , . ( , .) . , . , numpy, .

+2

S.Lott 21 . '08 10:33

dbr - , , , - , , .

def ReplaceSequence(inFilename, outFilename, oldSeq, newSeq):
    """Copy inFilename to outFilename, replacing every oldSeq with newSeq.

    The input is streamed in fixed-size chunks, so the whole file never has
    to fit in memory.  The result is the same as calling ``.replace(oldSeq,
    newSeq)`` on the complete file contents, including matches that straddle
    a chunk boundary.

    Args:
        inFilename: Path of the file to read (opened in binary mode).
        outFilename: Path of the file to write (opened in binary mode,
            truncated if it already exists).
        oldSeq: Non-empty byte sequence to search for.
        newSeq: Byte sequence written in place of each match.

    Raises:
        ValueError: If oldSeq is empty.
    """
    if not oldSeq:
        raise ValueError("oldSeq must not be empty")

    chunk = 1024
    oldSeqLen = len(oldSeq)
    # A match cut off by the end of the buffer can have at most
    # oldSeqLen - 1 of its bytes already read; only that many bytes need
    # to be held back until the next read.
    keep = oldSeqLen - 1

    with open(inFilename, "rb") as inputFile:
        with open(outFilename, "wb") as outputFile:
            pending = b""
            while True:
                data = inputFile.read(chunk)
                if not data:
                    break
                pending += data

                # Emit everything up to and including each complete match.
                start = 0
                while True:
                    hit = pending.find(oldSeq, start)
                    if hit < 0:
                        break
                    outputFile.write(pending[start:hit])
                    outputFile.write(newSeq)
                    start = hit + oldSeqLen

                # Bytes before `safe` cannot start a match any more; the
                # rest is carried over and re-examined with the next chunk.
                safe = max(start, len(pending) - keep)
                outputFile.write(pending[start:safe])
                pending = pending[safe:]

            # End of input: whatever is still held back contains no match.
            outputFile.write(pending)

+1

AJMayorga 17 . '09 17:04

AJMayorga , . .

:

def _replaceInChunks(chunks, oldSeq, newSeq):
    """Yield the pieces of a chunked byte stream with oldSeq replaced by newSeq.

    ``chunks`` is any iterable of byte strings.  Concatenating the yielded
    pieces gives the same bytes as ``b"".join(chunks).replace(oldSeq, newSeq)``.
    """
    # At most len(oldSeq) - 1 trailing bytes can belong to a match that is
    # completed by the next chunk, so only that many are held back.
    keep = len(oldSeq) - 1
    tail = b""
    for data in chunks:
        # split() finds the same leftmost, non-overlapping matches as replace().
        parts = (tail + data).split(oldSeq)
        remainder = parts.pop()  # bytes after the last complete match
        for part in parts:
            yield part
            yield newSeq
        cut = max(0, len(remainder) - keep)
        yield remainder[:cut]
        tail = remainder[cut:]
    # End of input: the held-back bytes cannot contain a complete match.
    yield tail


def ReplaceSequence(inFilename, outFilename, oldSeq, newSeq, chunk=1024):
    """Copy inFilename to outFilename, replacing every oldSeq with newSeq.

    The file is processed ``chunk`` bytes at a time, so it does not need to
    fit in memory.  Matches that straddle a chunk boundary are still found,
    and the pattern may be longer than ``chunk``.

    Args:
        inFilename: Path of the file to read (opened in binary mode).
        outFilename: Path of the file to write (opened in binary mode,
            truncated if it already exists).
        oldSeq: Non-empty byte sequence to search for.
        newSeq: Byte sequence written in place of each match.
        chunk: Number of bytes requested per read; defaults to 1024.

    Raises:
        ValueError: If oldSeq is empty or chunk is smaller than 1.
    """
    if not oldSeq:
        raise ValueError("oldSeq must not be empty")
    if chunk < 1:
        raise ValueError("chunk must be a positive number of bytes")

    with open(inFilename, "rb") as inputFile:
        with open(outFilename, "wb") as outputFile:
            # read() returns an empty byte string at end of file, which
            # stops the iterator.
            chunks = iter(lambda: inputFile.read(chunk), b"")
            for piece in _replaceInChunks(chunks, oldSeq, newSeq):
                if piece:
                    outputFile.write(piece)

+1

edasx 28 . '12 18:30

. , ?

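If you know the sequence you want to remove ahead of time, you could locate the matches by using subprocess to run "fgrep -o -b <search string>", and then rewrite the relevant sections of the file with the Python file object's seek, read and write methods.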

0

Alex Coventry 21 . '08 12:48

.

, - . :

import StringIO

def gen_chars(stream):
    """Yield the contents of *stream* one single-character read at a time.

    Iteration stops at the first read that returns an empty (falsy) value,
    i.e. at end of stream.
    """
    ch = stream.read(1)
    while ch:
        yield ch
        ch = stream.read(1)

def gen_unique_chars(stream):
    """Yield the characters of *stream* with each run of repeats collapsed.

    A character is yielded only when it differs from the one read
    immediately before it, so "1122233" produces "1", "2", "3".
    """
    previous = ''
    for current in gen_chars(stream):
        if current == previous:
            # Still inside a run of the same character: skip it.
            continue
        yield current
        previous = current

def remove_seq(infile, outfile):
    """Copy *infile* to *outfile*, writing each run of repeated characters once."""
    emit = outfile.write
    for unique_char in gen_unique_chars(infile):
        emit(unique_char)

# In-memory stand-in for a file opened for reading
infile = StringIO.StringIO("1122233333444555")

# In-memory stand-in for a file opened for writing
outfile = StringIO.StringIO()

# Collapse every run of repeated characters into a single character
remove_seq(infile, outfile)

# Will print "12345"
print(outfile.getvalue())

0

Triptych 17 . '09 17:23

dbr · Accepted Answer · 2008-10-21T13:18:37+0000

If you don't have enough memory to run open("big.file").read(), then numpy will not really help. It uses the same memory as Python variables do (if you only have 1 GB of RAM, you can only load 1 GB of data into numpy).

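The solution is simple - read the file in chunks. Open it with f = open("big.file", "rb"), then repeatedly call f.read(500), remove the sequence from each chunk and write the result to another file object. This is pretty much how you would do file reading/writing in C.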

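The problem then is what happens if the sequence you are looking for is split across two chunks. For example: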

import io

target_seq = b"567"
# An in-memory binary stream stands in for the large input file.
input_file = io.BytesIO(b"1234567890")

# Reading in fixed 5-byte chunks splits the target across two reads, so
# neither chunk contains it on its own.
first_chunk = input_file.read(5)   # reads 12345, doesn't contain 567
second_chunk = input_file.read(5)  # reads 67890, doesn't contain 567

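The obvious solution is to start at the first character of the file, check the next len(target_seq) characters, then move forward one character and check again, and so on.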

Something like this (pseudo-code!):

# Slide a window of len(target_seq) bytes over input_file, moving one byte
# at a time, and copy the data to out_file with every occurrence of
# target_seq replaced.  This is deliberately simple rather than fast: it
# performs one seek and one read per input byte.
if not target_seq:
    # A zero-length window would read nothing and end the loop immediately.
    raise ValueError("target_seq must not be empty")

chunk_size = len(target_seq)
seek_start = 0

while True:
    # whence=0 means "absolute offset from the start of the file"
    # (whence=1 would be relative to the current position).
    input_file.seek(seek_start, 0)
    cur_data = input_file.read(chunk_size)  # e.g. reads 123
    if not cur_data:
        # Reached the end of the input.
        break
    if target_seq == cur_data:
        # Found it!  Emit the replacement and jump past the whole match.
        # NOTE(review): use a bytes literal here if out_file is opened in
        # binary mode on Python 3.
        out_file.write("replacement_string")
        seek_start += chunk_size
    else:
        # Not it: only the first byte of the window is known not to start
        # a match, so copy just that byte and move forward by one.
        out_file.write(cur_data[:1])
        seek_start += 1

, ( ).

Removing a character sequence from a large binary using python

More articles: