Python 2 `tempfile` decoding with python-future

I am trying to write a Python 2/3 compatible routine to fetch a CSV file, decode it from `latin_1` into Unicode, and feed it to a `csv.DictReader` in a reliable, scalable way.

  • For Python 2/3 support, I use python-future, including importing `open` from `builtins`, and importing `unicode_literals` for consistent behavior
  • I hope to process exceptionally large files by spilling to disk using `tempfile.SpooledTemporaryFile`
  • I use `io.TextIOWrapper` to handle decoding from the `latin_1` encoding before feeding the data to `DictReader`

All of this works great under Python 3.

The problem is that `TextIOWrapper` expects to wrap a stream that conforms to `BufferedIOBase`. Unfortunately, in Python 2, although I imported the Python 3-style `open`, the vanilla Python 2 `tempfile.SpooledTemporaryFile` still, of course, returns a Python 2 `cStringIO.StringO` instead of the Python 3 `io.BytesIO` that `TextIOWrapper` requires.

I can think of these possible approaches:

  • Wrap the Python 2 `cStringIO.StringO` as a Python 3-style `io.BytesIO`. I'm not sure how to approach this - do I need to write such a wrapper, or does one already exist?
  • Find a Python 2 alternative for wrapping a `cStringIO.StringO` stream for decoding. I haven't found one yet.
  • Remove `SpooledTemporaryFile` and decode entirely in memory. How large would a CSV file have to be before working completely in memory becomes a problem?
  • SpooledTemporaryFile spill-to-disk. open python-future, , , , .

? - ?


from __future__ import (absolute_import, division,
                    print_function, unicode_literals)
from builtins import (ascii, bytes, chr, dict, filter, hex, input,  # noqa
                  int, map, next, oct, open, pow, range, round,  # noqa
                  str, super, zip)  # noqa
import csv
import tempfile
from io import TextIOWrapper
import requests

Init:

...
self._session = requests.Session()
...

:

def _fetch_csv(self, path):
    raw_file = tempfile.SpooledTemporaryFile(
        max_size=self._config.get('spool_size')
    )
    csv_r = self._session.get(self.url + path)
    for chunk in csv_r.iter_content():
        raw_file.write(chunk)
    raw_file.seek(0)
    text_file = TextIOWrapper(raw_file._file, encoding='latin_1')
    return csv.DictReader(text_file)

:

...in _fetch_csv
    text_file = TextIOWrapper(raw_file._file, encoding='utf-8')
AttributeError: 'cStringIO.StringO' object has no attribute 'readable'
+4
2

, . .

NamedTemporaryFile CSV, UTF-8, , , -quite- , Python 3 io.open.

, NamedTemporaryFile Python 2 , . , , , , , Python 2 3, , io.open. - backports.csv, CSV Python 3 Python 2.

from __future__ import absolute_import, division, print_function, unicode_literals
from builtins import str
import csv, tempfile, io, os
from backports import csv

# Sample rows; each one lines up with the X, Y, Name, Born header below.
data = [
    ["1", "1", "John Coltrane", 1926],
    ["2", "1", "Miles Davis", 1926],
    ["3", "1", "Bill Evans", 1929],
    ["4", "1", "Paul Chambers", 1935],
    ["5", "1", "Scott LaFaro", 1936],
    ["6", "1", "Sonny Rollins", 1930],
    ["7", "1", "Kenny Burrel", 1931],
]

## create CSV file
# Reserve a temporary file that is kept after closing (delete=False) and
# remember only its path.
with tempfile.NamedTemporaryFile(delete=False) as placeholder:
    filename = placeholder.name

# Reopen the path through io.open to get a text stream with an explicit
# encoding; newline='' leaves line endings to the csv writer.
with io.open(filename, mode='w', encoding="utf-8", newline='') as csv_file:
    headers = ['X', 'Y', 'Name', 'Born']
    writer = csv.writer(
        csv_file,
        quoting=csv.QUOTE_NONNUMERIC,
        lineterminator=str(os.linesep),
    )
    writer.writerow(headers)
    for record in data:
        print(record)
        writer.writerow(record)
+2
,

@cbare. , :

  • tempfile.NamedTemporaryFile() . .
  • with .
  • , ( ), io.open().

, . , (, nt) , , , . , - , .

:

# Create temporary file
with tempfile.NamedTemporaryFile() as tf_oldstyle:
    # Borrow its file descriptor - this also works with tempfile.TemporaryFile,
    # which has no meaningful name at all.
    fd = tf_oldstyle.fileno()
    # Wrap that fd with io.open in the desired mode (binary mode would work too).
    # closefd=False leaves ownership of the descriptor with tf_oldstyle, so a
    # with statement is safe here: leaving it flushes tf's buffer without
    # closing fd, and the outer with block then closes fd exactly once.
    with io.open(fd, 'w+', encoding='utf-8', newline='', closefd=False) as tf:
        # now work with the tf
        writer = csv.writer(tf, ...)
        writer.writerow(...)

# At this point, fd is closed, and the file is deleted.

tempfile.mkstemp(), fd , *TemporaryFile, , .

# mkstemp() hands back an already-open OS-level descriptor and the file's
# path; closing the descriptor and removing the file are up to the caller.
fd, name = tempfile.mkstemp()
try:
    # closefd=False keeps ownership of fd in this snippet: leaving the with
    # block flushes tf's buffered text but does not close the descriptor, so
    # the os.close() below is the only close.
    with io.open(fd, 'w+', encoding='utf-8', newline='', closefd=False) as tf:
        writer = csv.writer(tf, ...)
        writer.writerow(...)
finally:
    os.close(fd)
    os.unlink(name)

, SpooledTemporaryFile

SpooledTemporaryFile python2 rollover.

: .

import io
import sys
import tempfile

# Expose a SpooledTemporaryFile whose underlying file objects come from the
# `io` module on both major Python versions.
if sys.version_info >= (3,):
    # Python 3: the standard-library class is used unchanged.
    SpooledTemporaryFile = tempfile.SpooledTemporaryFile
else:
    class SpooledTemporaryFile(tempfile.SpooledTemporaryFile):
        """Python 2 ``SpooledTemporaryFile`` backed by ``io`` file objects.

        The initial in-memory buffer is replaced with ``io.BytesIO`` (when
        ``'b'`` is in the mode) or ``io.StringIO`` (otherwise), and after
        rollover the on-disk file is reopened through ``io.open``.
        """

        def __init__(self, max_size=0, mode='w+b', **kwargs):
            """Initialise the base class, then swap in an ``io`` buffer.

            ``max_size``, ``mode`` and any extra keyword arguments are
            forwarded to the base class constructor.
            """
            # replace cStringIO with io.BytesIO or io.StringIO
            super(SpooledTemporaryFile, self).__init__(max_size, mode, **kwargs)
            if 'b' in mode:
                self._file = io.BytesIO()
            else:
                self._file = io.StringIO(newline='\n')  # see python3 tempfile sources for reason

        def rollover(self):
            """Move the buffer to disk, then reopen the disk file via ``io.open``.

            Does nothing if the file has already been rolled over.
            """
            if self._rolled:
                return
            # call super implementation and then replace underlying file object
            super(SpooledTemporaryFile, self).rollover()
            # Capture everything needed to rebuild an equivalent wrapper
            # around the same descriptor.
            # NOTE(review): `delete` is read from the object the base class
            # created; this presumes that object is always a
            # tempfile._TemporaryFileWrapper - confirm on every target
            # platform, since a plain file object would lack the attribute.
            fd = self._file.fileno()
            name = self._file.name
            mode = self._file.mode
            delete = self._file.delete
            pos = self._file.tell()
            # self._file is a tempfile._TemporaryFileWrapper.
            # It caches methods so we cannot just replace its .file attribute,
            # so create another _TemporaryFileWrapper
            # NOTE(review): the new io object shares `fd` with the wrapper that
            # is dropped by the reassignment below - verify that finalizing the
            # old wrapper neither closes the descriptor nor loses unflushed data.
            file = io.open(fd, mode)
            # Restore the position recorded before reopening.
            file.seek(pos)
            self._file = tempfile._TemporaryFileWrapper(file, name, delete)
+1

Source: https://habr.com/ru/post/1624540/


All Articles