Split a large CSV file by column value in Python

I have a large CSV file that I cannot process in memory with Python. I split it into several pieces after grouping by the value of a specific column, using the following logic:

    def splitDataFile(self, data_file):
        self.list_of_chunk_names = []
        csv_reader = csv.reader(open(data_file, "rb"), delimiter="|")
        columns = csv_reader.next()
        for key, rows in groupby(csv_reader, lambda row: (row[1])):
            file_name = "data_chunk" + str(key) + ".csv"
            self.list_of_chunk_names.append(file_name)
            with open(file_name, "w") as output:
                output.write("|".join(columns) + "\n")
                for row in rows:
                    output.write("|".join(row) + "\n")
        print "message: list of chunks ", self.list_of_chunk_names
        return

The logic works, but it is slow. I am wondering how I can optimize this, for example with pandas?

Edit

Further explanation: I am not looking for a simple split into equal-sized pieces (for example, 1000 rows each); I want to split by the value of a column, which is why I use groupby.
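To make that concrete: itertools.groupby only groups consecutive rows that share a key, so the code above gives one chunk per value only because the file is already sorted (or at least grouped) by that column. A minimal illustration with made-up rows:

    from itertools import groupby

    # made-up rows, grouped by the second column (index 1)
    rows = [["r1", "A"], ["r2", "A"], ["r3", "B"], ["r4", "B"], ["r5", "A"]]

    for key, group in groupby(rows, lambda row: row[1]):
        print(key, list(group))
    # A [['r1', 'A'], ['r2', 'A']]
    # B [['r3', 'B'], ['r4', 'B']]
    # A [['r5', 'A']]   <- a non-consecutive "A" starts a new group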

4 answers

I would do something like the following, where I iterate over the unique values of the column to split on and use them to filter out chunks of the data.

    def splitWithPandas(data_file, split_by_column):
        # read only the split column and collect its unique values
        values_to_split_by = pd.read_csv(data_file, delimiter="|", usecols=[split_by_column])
        values_to_split_by = values_to_split_by.drop_duplicates()
        values_to_split_by = pd.unique(values_to_split_by.values.ravel())
        for i in values_to_split_by:
            # re-read the file in chunks, keeping only the rows for this value
            iter_csv = pd.read_csv(data_file, delimiter="|", chunksize=100000)
            df = pd.concat([chunk[chunk[split_by_column] == i] for chunk in iter_csv])
            df.to_csv("data_chunk_" + str(i), sep="|", index=False)
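For reference, a hypothetical call (the file and column names are made up). Note the trade-off in this design: the whole input is re-read once per unique value, so memory use stays tiny but the file is scanned many times.

    # hypothetical usage: split a "|"-delimited file on the column named "group_id"
    splitWithPandas("data.csv", "group_id")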

Use this Python 3 program:

    #!/usr/bin/env python3
    import binascii
    import csv
    import os.path
    import sys
    from tkinter.filedialog import askopenfilename, askdirectory
    from tkinter.simpledialog import askinteger


    def split_csv_file(f, dst_dir, keyfunc):
        csv_reader = csv.reader(f)
        csv_writers = {}
        for row in csv_reader:
            k = keyfunc(row)
            if k not in csv_writers:
                csv_writers[k] = csv.writer(open(os.path.join(dst_dir, k), mode='w', newline=''))
            csv_writers[k].writerow(row)


    def get_args_from_cli():
        input_filename = sys.argv[1]
        column = int(sys.argv[2])
        dst_dir = sys.argv[3]
        return (input_filename, column, dst_dir)


    def get_args_from_gui():
        input_filename = askopenfilename(
            filetypes=(('CSV', '.csv'),),
            title='Select CSV Input File')
        column = askinteger('Choose Table Column', 'Table column')
        dst_dir = askdirectory(title='Select Destination Directory')
        return (input_filename, column, dst_dir)


    if __name__ == '__main__':
        if len(sys.argv) == 1:
            input_filename, column, dst_dir = get_args_from_gui()
        elif len(sys.argv) == 4:
            input_filename, column, dst_dir = get_args_from_cli()
        else:
            raise Exception("Invalid number of arguments")

        with open(input_filename, mode='r', newline='') as f:
            split_csv_file(f, dst_dir, lambda r: r[column-1]+'.csv')
            # if the column has funky values resulting in invalid filenames
            # replace the line from above with:
            # split_csv_file(f, dst_dir, lambda r: binascii.b2a_hex(r[column-1].encode('utf-8')).decode('utf-8')+'.csv')

Save it as split-csv.py and run it from Explorer or from the command line.

For example, to split data.csv on column 1 and write the output files to dstdir, use:

  python split-csv.py data.csv 1 dstdir 

If you run it without arguments, the Tkinter-based GUI will prompt you to select the input file, the column (1-based index), and the destination directory.



You will probably get the best performance using pandas' built-in chunking (the chunksize keyword argument to read_csv).

http://pandas.pydata.org/pandas-docs/version/0.16.2/generated/pandas.read_csv.html

For instance,

    reader = pd.read_table('my_data.csv', chunksize=4)

    for chunk in reader:
        print(chunk)

EDIT:

Something along these lines might get you where you want:

    import pandas as pd

    group_col_indx = 1
    group_col = pd.read_csv('test.csv', usecols=[group_col_indx])
    keys = group_col.iloc[:, 0].unique()

    for key in keys:
        df_list = []
        reader = pd.read_csv('test.csv', chunksize=2)
        for chunk in reader:
            good_rows = chunk[chunk.iloc[:, group_col_indx] == key]
            df_list.append(good_rows)
        df_key = pd.concat(df_list)
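The snippet stops after building df_key for each key; to actually write the split files out you would add one line at the end of the for key loop, for example (the output file name pattern here is my own choice):

    # inside the `for key in keys:` loop, right after df_key = pd.concat(df_list)
    df_key.to_csv("data_chunk_{}.csv".format(key), index=False)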

I suspect your biggest bottleneck is opening and closing a file handle every time you process a new block of rows. A better approach, as long as the number of files you write is not too large, is to keep all the files open. Here's an outline:

    def splitDataFile(self, data_file):
        open_files = dict()
        input_file = open(data_file, "rb")
        try:
            ...
            csv_reader = csv.reader(input_file, ...)
            ...
            for key, rows in groupby(csv_reader, lambda row: (row[1])):
                ...
                try:
                    output = open_files[key]
                except KeyError:
                    output = open(file_name, "w")
                    open_files[key] = output  # remember the handle so it is reused
                output.write(...)
                ...
        finally:
            for open_file in open_files.itervalues():
                open_file.close()
            input_file.close()

Of course, if you only ever have a single group per key, this will not help. (In fact, it may make things slightly worse, because you end up keeping files open unnecessarily.) The more often you end up writing to the same file again, the more benefit you will get from this change.

You can combine this with pandas if you like, using the chunked read_csv or read_table to handle the input.
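A hedged sketch of that combination, assuming the "|"-delimited input from the question; the function name, the split column argument, the output file name pattern and the chunk size are my own choices, not something from the original code. It reads the input once in chunks, groups each chunk by the split column, and appends every group to a per-key file whose handle stays open for the whole run, writing the header only the first time a key is seen.

    import pandas as pd

    def split_with_pandas_chunks(data_file, split_col, chunksize=100000):
        # sketch only: one pass over the input, per-key output files kept open
        open_files = {}   # key -> [file handle, header already written?]
        try:
            for chunk in pd.read_csv(data_file, delimiter="|", chunksize=chunksize):
                for key, group in chunk.groupby(split_col):
                    if key not in open_files:
                        handle = open("data_chunk_{}.csv".format(key), "w")
                        open_files[key] = [handle, False]
                    handle, header_written = open_files[key]
                    group.to_csv(handle, sep="|", index=False, header=not header_written)
                    open_files[key][1] = True
        finally:
            for handle, _ in open_files.values():
                handle.close()

    # hypothetical call: split on the column named "group_id"
    # split_with_pandas_chunks("data.csv", "group_id")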


Source: https://habr.com/ru/post/1235537/

