I understand that I had almost the same idea as Winston Evert: creating a regular expression.
But my regex differs in two ways:
- it works both when ix_profile < ix_user and when ix_profile > ix_user;
- it captures only the user column: the profile column is matched by the subpattern '"(?!7")[^\t\r\n"]*"', which fails to match if "7" is present in this column; therefore the single capturing group yields only the valid users
.
In addition, I tested several matching and extraction algorithms:
1) with re.finditer ()
2) with re.match () , and the regex matches 40 fields
3) with re.match() and a regex that matches only the first max(ix_profile, ix_user) + 1 fields
4) like 3 , but with a simple dictionary instead of the defaultdict instance
To measure time, my code creates a file based on the information you gave regarding its contents.
.
I tested the following 4 functions in 4 codes:
1
def get_users_short_1(log): users_short = defaultdict(int) f = open(log)
2
def get_users_short_2(log): users_short = defaultdict(int) f = open(log)
3
def get_users_short_3(log): users_short = defaultdict(int) f = open(log)
4
Full code 4 that seems the fastest:
# NOTE(review): this listing was flattened onto a single line; the line
# breaks and indentation below are reconstructed from the syntax -- confirm
# the extent of the `if choi:` block against the original script.
import re
from random import choice,randint,sample
import csv
import random
from time import clock

# Flag guarding the generation of the test file 'biggy.txt'.
choi = 1
if choi:
    ntot = 1000
    chars = 'abcdefghijklmnopqrstuvwxyz0123456789'
    def ry(a=30,b=80,chars=chars,nom='abcdefghijklmnopqrstuvwxyz'):
        # Random field content: 30 to 80 characters drawn from `chars` when
        # `a` keeps its default value of 30, otherwise 8 to 12 lowercase
        # letters drawn from `nom`.  The `b` parameter is not used.
        if a==30:
            return ''.join(choice(chars) for i in xrange(randint(30,80)))
        else:
            return ''.join(choice(nom) for i in xrange(randint(8,12)))

    # 200 distinct row numbers (sorted) that will be written with
    # profile.type == '"7"'.
    num = sample(xrange(1000),200)
    num.sort()
    print 'num==',num
    # Row numbers, none of them in `num`, that will all share the user "BRAD".
    several = [e//3 for e in xrange(0,800,7) if e//3 not in num]
    print
    print 'several==',several

    with open('biggy.txt','w') as f:
        # 40 column names; 'profile.id' is at index 4 and 'profile.type'
        # at index 8.
        head = ('aaa','bbb','ccc','ddd','profile.id','fff','ggg','hhhh','profile.type','iiii',
                'jjj','kkkk','lll','mmm','nnn','ooo','ppp','qq','rr','ss',
                'tt','uu','vv','ww','xx','yy','zz','razr','fgh','ty',
                'kfgh','zer','sdfs','fghf','dfdf','zerzre','jkljkl','vbcvb','kljlk','dhhdh')
        f.write('\t'.join(head)+'\n')
        for i in xrange(1000):
            # x.join('""') wraps x in double quotes; column 4 gets a short
            # letters-only value, every other column a long random one.
            li = [ ry(a=8).join('""') if n==4 else ry().join('""') for n in xrange(40) ]
            if i in num:
                # Row to be rejected: junk user value and profile.type "7".
                li[4] = '@#~&=*;'
                li[8] = '"7"'
            if i in several:
                li[4] = '"BRAD"'
            f.write('\t'.join(li)+'\n')

from collections import defaultdict

def get_users(log):
    users = defaultdict(int)
    f = open(log)
    # NOTE(review): the remainder of get_users is cut off in this excerpt.
One of the results of this code 4 is, for example:
num== [2, 12, 16, 23, 26, 33, 38, 40, 43, 45, 51, 53, 84, 89, 93, 106, 116, 117, 123, 131, 132, 135, 136, 138, 146, 148, 152, 157, 164, 168, 173, 176, 179, 189, 191, 193, 195, 199, 200, 208, 216, 222, 224, 227, 233, 242, 244, 245, 247, 248, 251, 255, 256, 261, 262, 266, 276, 278, 291, 296, 298, 305, 307, 308, 310, 312, 314, 320, 324, 327, 335, 337, 340, 343, 350, 356, 362, 370, 375, 379, 382, 385, 387, 409, 413, 415, 419, 433, 441, 443, 444, 446, 459, 462, 474, 489, 492, 496, 505, 509, 511, 512, 518, 523, 541, 546, 548, 550, 552, 558, 565, 566, 572, 585, 586, 593, 595, 601, 609, 610, 615, 628, 632, 634, 638, 642, 645, 646, 651, 654, 657, 660, 662, 665, 670, 671, 680, 682, 687, 688, 690, 692, 695, 703, 708, 716, 717, 728, 729, 735, 739, 741, 742, 765, 769, 772, 778, 790, 792, 797, 801, 808, 815, 825, 828, 831, 839, 849, 858, 859, 862, 864, 872, 874, 890, 899, 904, 906, 913, 916, 920, 923, 928, 941, 946, 947, 953, 955, 958, 959, 961, 971, 975, 976, 979, 981, 985, 989, 990, 999] several== [0, 4, 7, 9, 11, 14, 18, 21, 25, 28, 30, 32, 35, 37, 39, 42, 44, 46, 49, 56, 58, 60, 63, 65, 67, 70, 72, 74, 77, 79, 81, 86, 88, 91, 95, 98, 100, 102, 105, 107, 109, 112, 114, 119, 121, 126, 128, 130, 133, 137, 140, 142, 144, 147, 149, 151, 154, 156, 158, 161, 163, 165, 170, 172, 175, 177, 182, 184, 186, 196, 198, 203, 205, 207, 210, 212, 214, 217, 219, 221, 226, 228, 231, 235, 238, 240, 249, 252, 254, 259, 263] len(num)== 200 : number of lines with ix_profile=='"7"' USERS['BRAD']== 91 then : 1000 lines - 200 incorrect - 91 identical + 1 user BRAD = 710 len(USERS)== 710 len(USERS_short_4)== 710 USERS == USERS_short_4 is True
But the results are more or less variable. I got:
get_users_short_1() / get_users() = 82.957476637 % get_users_short_1() / get_users() = 82.3987686867 % get_users_short_1() / get_users() = 90.2949842932 % get_users_short_1() / get_users() = 78.8063007461 % get_users_short_1() / get_users() = 90.4743181768 % get_users_short_1() / get_users() = 81.9635560003 % get_users_short_1() / get_users() = 83.9418269406 % get_users_short_1() / get_users() = 89.4344442255 % get_users_short_2() / get_users() = 80.4891442088 % get_users_short_2() / get_users() = 69.921943776 % get_users_short_2() / get_users() = 81.8006709304 % get_users_short_2() / get_users() = 83.6270772928 % get_users_short_2() / get_users() = 97.9821084403 % get_users_short_2() / get_users() = 84.9307558629 % get_users_short_2() / get_users() = 75.9384820018 % get_users_short_2() / get_users() = 86.2964748485 % get_users_short_3() / get_users() = 69.4332754744 % get_users_short_3() / get_users() = 58.5814726668 % get_users_short_3() / get_users() = 61.8011476831 % get_users_short_3() / get_users() = 67.6925083362 % get_users_short_3() / get_users() = 65.1208124156 % get_users_short_3() / get_users() = 72.2621727569 % get_users_short_3() / get_users() = 70.6957501222 % get_users_short_3() / get_users() = 68.5310031226 % get_users_short_3() / get_users() = 71.6529128259 % get_users_short_3() / get_users() = 71.6153554073 % get_users_short_3() / get_users() = 64.7899044975 % get_users_short_3() / get_users() = 72.947531363 % get_users_short_3() / get_users() = 65.6691965629 % get_users_short_3() / get_users() = 61.5194374401 % get_users_short_3() / get_users() = 61.8396133666 % get_users_short_3() / get_users() = 71.5447862466 % get_users_short_3() / get_users() = 74.6710538858 % get_users_short_3() / get_users() = 72.9651233485 % get_users_short_4() / get_users() = 65.5224210767 % get_users_short_4() / get_users() = 65.9023813161 % get_users_short_4() / get_users() = 62.8055210129 % get_users_short_4() / get_users() = 64.9690049062 % get_users_short_4() / 
get_users() = 61.9050866134 % get_users_short_4() / get_users() = 65.8127125992 % get_users_short_4() / get_users() = 66.8112344201 % get_users_short_4() / get_users() = 57.865635278 % get_users_short_4() / get_users() = 62.7937713964 % get_users_short_4() / get_users() = 66.3440149528 % get_users_short_4() / get_users() = 66.4429530201 % get_users_short_4() / get_users() = 66.8692388625 % get_users_short_4() / get_users() = 66.5949137537 % get_users_short_4() / get_users() = 69.1708488794 % get_users_short_4() / get_users() = 59.7129743801 % get_users_short_4() / get_users() = 59.755297387 % get_users_short_4() / get_users() = 60.6436352185 % get_users_short_4() / get_users() = 64.5023727945 % get_users_short_4() / get_users() = 64.0153937511 %
.
I would like to know what results you get with my code on your real file, on a computer that is certainly more powerful than mine. Please let me know.
.
.
EDIT 1
With
def get_users_short_Machin(log): users_short = defaultdict(int) f = open(log)
I have
get_users_short_Machin() / get_users() = 60.6771821308 % get_users_short_Machin() / get_users() = 71.9300992989 % get_users_short_Machin() / get_users() = 85.1695214715 % get_users_short_Machin() / get_users() = 72.7722233685 % get_users_short_Machin() / get_users() = 73.6311173237 % get_users_short_Machin() / get_users() = 86.0848484053 % get_users_short_Machin() / get_users() = 75.1661981729 % get_users_short_Machin() / get_users() = 72.8888452474 % get_users_short_Machin() / get_users() = 76.7185685993 % get_users_short_Machin() / get_users() = 82.7007096958 % get_users_short_Machin() / get_users() = 71.1678957888 % get_users_short_Machin() / get_users() = 71.9845835126 %
Using a simple dict:
users_short = {} ....... for line in f: #if i % 1000000 == 0: print "Line %d" % i # progress notification l = line.split('\t', maxsplits) if l[ix_profile] != '"7"': # "7" indicates a bad value # use list slicing to remove quotes us = l[ix_user][1:-1] if us not in users_short: users_short[us] = 1 else: users_short[us] += 1
This slightly improves the run time, but the ratio stays above that of my code 4 shown earlier:
get_users_short_Machin2() / get_users() = 71.5959919389 % get_users_short_Machin2() / get_users() = 71.6118864535 % get_users_short_Machin2() / get_users() = 66.3832514274 % get_users_short_Machin2() / get_users() = 68.0026407277 % get_users_short_Machin2() / get_users() = 67.9853921552 % get_users_short_Machin2() / get_users() = 69.8946203037 % get_users_short_Machin2() / get_users() = 71.8260030248 % get_users_short_Machin2() / get_users() = 78.4243267003 % get_users_short_Machin2() / get_users() = 65.7223734428 % get_users_short_Machin2() / get_users() = 69.5903935612 %
.
EDIT 2
The fastest:
def get_users_short_CSV(log):
    """Count how many accepted lines each user id appears on.

    *log* is a tab-separated file whose first line is a header naming the
    columns.  The header must contain ``profile.id`` (the user) and
    ``profile.type``.  On data lines both of these fields must be wrapped
    in double quotes.  A line is skipped when its ``profile.type`` field is
    exactly ``"7"`` or when it does not match that layout.

    Args:
        log: Path of the file to read.

    Returns:
        A dict mapping each user id (surrounding quotes removed) to the
        number of accepted lines it appears on.  An empty file yields an
        empty dict.

    Raises:
        ValueError: If the header lacks one of the two required columns.
    """
    users_short = {}

    # The csv module wants binary mode on Python 2 and text mode with
    # newline='' on Python 3; in both cases line endings reach the regex
    # untouched.
    if str is bytes:
        f = open(log, 'rb')
    else:
        f = open(log, 'r', newline='')

    # The with-block guarantees the file is closed even if parsing fails.
    with f:
        # Read header line
        header = next(csv.reader(f, delimiter='\t'), None)
        if header is None:
            # Empty file: nothing to count.
            return users_short
        ix_profile = header.index('profile.type')
        ix_user = header.index('profile.id')

        # Only the fields up to the right-most column of interest are
        # described; re.match anchors at the start of the line and ignores
        # whatever follows the last described field.
        fields = [r'[^\t]*'] * (max(ix_profile, ix_user) + 1)
        # The negative lookahead rejects a profile field that is exactly "7".
        fields[ix_profile] = r'"(?!7")[^\t\r\n"]*"'
        # The only capturing group: the user id without its quotes.
        fields[ix_user] = r'"([^\t\r\n"]*)"'
        match_line = re.compile('\t'.join(fields)).match

        for line in f:
            found = match_line(line)
            if found:
                user = found.group(1)
                # A plain dict with an explicit membership test is kept on
                # purpose: it is the variant the benchmarks favoured.
                if user in users_short:
                    users_short[user] += 1
                else:
                    users_short[user] = 1
    return users_short
result
get_users_short_CSV() / get_users() = 31.6443901114 % get_users_short_CSV() / get_users() = 44.3536176134 % get_users_short_CSV() / get_users() = 47.2295100511 % get_users_short_CSV() / get_users() = 45.4912200716 % get_users_short_CSV() / get_users() = 63.7997241038 % get_users_short_CSV() / get_users() = 43.5020255488 % get_users_short_CSV() / get_users() = 40.9188320386 % get_users_short_CSV() / get_users() = 43.3105062139 % get_users_short_CSV() / get_users() = 59.9184895288 % get_users_short_CSV() / get_users() = 40.22047881 % get_users_short_CSV() / get_users() = 48.3615872543 % get_users_short_CSV() / get_users() = 47.0374831251 % get_users_short_CSV() / get_users() = 44.5268626789 % get_users_short_CSV() / get_users() = 53.1690205938 % get_users_short_CSV() / get_users() = 43.4022458372 %
.
EDIT 3
I tested get_users_short_CSV () with 10,000 lines in a file instead of 1000:
len(num)== 2000 : number of lines with ix_profile=='"7"' USERS['BRAD']== 95 then : 10000 lines - 2000 incorrect - 95 identical + 1 user BRAD = 7906 len(USERS)== 7906 len(USERS_short_CSV)== 7906 USERS == USERS_short_CSV is True