Conditionally replace values in one list using another list of different length and ranges, based on percentage match, in Python

One text file, "Truth", contains the following values:

0.000000    3.810000    Three
3.810000    3.910923    NNNN
3.910923    5.429000    AAAA
5.429000    7.060000    AAAA
7.060000    8.411000    MMMM
8.411000    8.971000    MMMM
8.971000    13.40600    MMMM
13.40600    13.82700    Zero
13.82700    15.935554   One

Another text file "Test" contains the following values:

0.000000    3.810000    Three
3.810000    3.910923    Three
3.910923    5.429000    AAAA
5.429000    7.060000    Three
7.060000    8.411000    Three
8.411000    8.971000    Zero
8.971000    13.40600    Three
13.40600    13.82700    Zero
13.82700    15.935554   Two
15.935554   20.138337   Two 

Now I want to replace the tags in Test with the tag MMMM from Truth.

The working code that I have so far is:

### Assuming I have already read in both the files into truth and test

# Build `res` with exactly one (start, end, label) tuple per row of `test`.
# A test row takes the truth label only when the truth row at the same
# position is tagged 'MMMM' and has identical start and end values;
# otherwise the test row keeps its own label.
res = []

for j, test_row in enumerate(test):
    start, end, label = test_row[0], test_row[1], test_row[2]
    # Iterating over `test` (not `truth`) keeps trailing test rows when
    # `test` is longer, and the bounds check avoids an IndexError when
    # `truth` is longer or shorter than `test`.
    if j < len(truth):
        truth_row = truth[j]
        if truth_row[2] == 'MMMM' and truth_row[0] == start and truth_row[1] == end:
            label = truth_row[2]
    res.append((start, end, label))

# print(row) with a single argument produces the same output under both
# Python 2 and Python 3.
for row in res:
    print(row)

My code is ugly, but works fine as long as the ranges match exactly. However, I am not sure how to proceed if the truth file is much longer than the test file, i.e. it contains more intervals and labels.

For example, my truth file might look like this:

    0.000000    1.00000     MMMM
    1.000       3.810000    Three
    3.810000    3.910923    NNNN
    3.910923    5.429000    AAAA
    5.429000    6.0000      MMMM
    6.0000      7.060000    AAAA
    7.060000    8.411000    MMMM
    8.411000    8.971000    MMMM
    8.971000    11.00       abcd
    11.00       13.40600    MMMM
    13.40600    13.82700    Zero
    13.82700    15.935554   One

In such a scenario, how do I accurately update / replace tags with minimal data loss?

, , , 80% MMMM ? , .

+3
3

, , , , , " " , "" j - .

[j] test [k] ( , ). , , .

, 1 ( while while "value test[k] in range of value truth[j]) , .

, , [k] [j], continue j ( ).

,


# Collect into `res` the test intervals that sufficiently match an 'MMMM'
# truth interval, relabelled with the truth label.
# NOTE(review): assumes `truth` and `test` are sequences of
# (start, end, label) rows with numeric start/end - confirm against the
# loading code, which is not shown here.
l_truth = len(truth)
l_test = len(test)

# Number of rows appended to `res` so far.  It is also used as the index of
# the first test row examined for each truth row (see `count2` below).
count = 0

# Only matching test rows are appended; test rows that match no 'MMMM'
# truth row never reach `res`.
res = []

for j in range(l_truth):
    # Snapshot the cursor so that increments of `count` made inside the inner
    # loop do not change the range that loop is already iterating over.
    count2= count
    for k in range(count2,l_test):
        # The label check does not depend on k; for a non-'MMMM' truth row the
        # inner loop simply runs through every k doing nothing.
        if truth[j][2]== 'MMMM': 
            min_truth = truth[j][0]
            max_truth = truth[j][1]
            min_test = test[k][0]
            max_test = test[k][1]

            #diff_truth = max_truth - min_truth
            # Length of the test interval; it is the denominator of every
            # ratio below, so a zero-length test interval raises
            # ZeroDivisionError in the branches that compute a ratio.
            diff_test = max_test - min_test

            # Case 1: the test interval lies entirely inside the truth interval.
            if (min_truth <= min_test) and (max_truth >= max_test):
                res.append((test[k][0], test[k][1],truth[j][2]))
                count +=1
            # Case 2: truth starts at/before the test start and ends at/before
            # the test end.  Accept when the part of the test interval past the
            # truth end is at most 20% of the test length.
            elif (min_truth <= min_test) and (max_truth <= max_test):
                #diff_min = min_truth - min_test
                diff_max = max_test - max_truth
                ratio = diff_max/diff_test
                if ratio <= 0.2:
                    res.append((test[k][0], test[k][1],truth[j][2]))
                    count +=1
            # Case 3: truth starts at/after the test start and ends at/after
            # the test end.  Accept when the part of the test interval before
            # the truth start is at most 20% of the test length.
            elif (min_truth >= min_test) and (max_truth >= max_test):
                diff_min = min_truth - min_test
                #diff_max = max_test - max_truth
                ratio = diff_min/diff_test
                if ratio <= 0.2:
                    res.append((test[k][0], test[k][1],truth[j][2]))
                    count+=1
            # Case 4: the truth interval lies inside the test interval.  Accept
            # when the uncovered parts on both sides together are at most 20%
            # of the test length.
            elif (min_truth >= min_test) and (max_truth <= max_test):
                diff_min = min_truth - min_test
                diff_max = max_test - max_truth
                ratio = (diff_min+diff_max)/diff_test
                if ratio <= 0.2:
                    res.append((test[k][0], test[k][1],truth[j][2]))
                    count+=1
            # The four cases above cover every ordering of ordinary numbers,
            # so this branch is not expected to run.
            else:
                pass
        else:
            continue

# Python 2 print statement: emits one result tuple per line.
for i in range(len(res)):
    print res[i]

, . , , .

+2

"" - :

# Report, for every test interval, each truth interval it overlaps and what
# percentage of the test interval that overlap represents.

# Test intervals as [start, end, label].
raw_test = [
    [0.000000, 3.810000, 'Three'],
    [3.810000, 3.910923, 'Three'],
    [3.910923, 5.429000, 'AAAA '],
    [5.429000, 7.060000, 'Three'],
    [7.060000, 8.411000, 'Three'],
    [8.411000, 8.971000, 'Zero'],
    [8.971000, 13.40600, 'Three'],
    [13.40600, 13.82700, 'Zero'],
    [13.82700, 15.935554, 'Two'],
    [15.935554, 20.138337, 'Two'],
]

# Truth intervals as [start, end, label].
raw_truth = [
    [0.000000, 1.00000, 'MMMM'],
    [1.000, 3.810000, 'Three'],
    [3.810000, 3.910923, 'NNNN'],
    [3.910923, 5.429000, 'AAAA'],
    [5.429000, 6.0000, 'MMMM'],
    [6.0000, 7.060000, 'AAAA'],
    [7.060000, 8.411000, 'MMMM'],
    [8.411000, 8.971000, 'MMMM'],
    [8.971000, 11.00, 'abcd'],
    [11.00, 13.40600, 'MMMM'],
    [13.40600, 13.82700, 'Zero'],
    [13.82700, 15.935554, 'One'],
]


def compute_overlaps(test_rows, truth_map):
    """Return one report row per overlapping (test interval, truth interval) pair.

    Args:
        test_rows: iterable of (start, end, size, label) tuples.
        truth_map: dict mapping (start, end) tuples to a label.

    Returns:
        A list of 13-element lists laid out as
        [test-min, test-max, test-size, test-lab, '#',
         truth-min, truth-max, truth-lab, '#',
         min-over, max-over, over-size, percent],
        where percent is the overlap size as a percentage of the test size.
        Pairs that merely touch or do not overlap are omitted.
    """
    rows = []
    for mi, ma, siz, lab in test_rows:
        # A zero-length (or malformed, negative-length) test interval has no
        # meaningful percentage; skipping it avoids a division by zero.
        if siz <= 0:
            continue
        for (truMi, truMa), truVal in truth_map.items():
            # The intersection of two intervals is [max of starts, min of
            # ends].  Testing its size directly also catches a truth interval
            # lying strictly inside the test interval, which an endpoint-only
            # check misses.
            minOv = max(truMi, mi)
            maxOv = min(truMa, ma)
            sizOv = maxOv - minOv
            if sizOv <= 0:
                continue
            perc = sizOv / (siz / 100.0)
            rows.append([mi, ma, siz, lab,
                         '#', truMi, truMa, truVal,
                         '#', minOv, maxOv, sizOv, perc])
    return rows


# (start, end) -> label; for a repeated interval the first label is kept.
truth = {}
for mi, ma, key in raw_truth:
    truth.setdefault((mi, ma), key)

# (start, end, size, label) per test interval.
test = [(mi, ma, ma - mi, lab) for mi, ma, lab in raw_test]

# Header row followed by one row per overlapping pair.
overlap = [["test-min", "test-max", "test-size", "test-lab",
            "#", "truth-min", "truth-max", "truth-lab",
            "#", "min-over", "max-over", "over-size", "%"]]
overlap.extend(compute_overlaps(test, truth))

# just some printing:
print(truth)
print()

print(test)
print()

for d in overlap:
    for x in d:
        if isinstance(x, str):
            if x == '#':
                print('  |  ', end="")
            else:
                print('{:<10}'.format(x), end="")
        else:
            print('{:<10.5f}'.format(x), end="")
    print(" %")

# the print statements are python3 - at the time this answer was written, the question
# had no python 2 tag. Replace the python 3 print statements with
#    print '  |  ',
#    print '{:<10}'.format(x),  
#    print '{:<10.5f}'.format(x),    
# etc. or adapt them accordingly - see https://stackoverflow.com/a/2456292/7505395

Output:

test-min  test-max  test-size test-lab    |  truth-min truth-max truth-lab   |  min-over  max-over  over-size %          %
0.00000   3.81000   3.81000   Three       |  0.00000   1.00000   MMMM        |  0.00000   1.00000   1.00000   26.24672   %
0.00000   3.81000   3.81000   Three       |  1.00000   3.81000   Three       |  1.00000   3.81000   2.81000   73.75328   %
3.81000   3.91092   0.10092   Three       |  3.81000   3.91092   NNNN        |  3.81000   3.91092   0.10092   100.00000  %
3.91092   5.42900   1.51808   AAAA        |  3.91092   5.42900   AAAA        |  3.91092   5.42900   1.51808   100.00000  %
5.42900   7.06000   1.63100   Three       |  5.42900   6.00000   MMMM        |  5.42900   6.00000   0.57100   35.00920   %
5.42900   7.06000   1.63100   Three       |  6.00000   7.06000   AAAA        |  6.00000   7.06000   1.06000   64.99080   %
7.06000   8.41100   1.35100   Three       |  7.06000   8.41100   MMMM        |  7.06000   8.41100   1.35100   100.00000  %
8.41100   8.97100   0.56000   Zero        |  8.41100   8.97100   MMMM        |  8.41100   8.97100   0.56000   100.00000  %
8.97100   13.40600  4.43500   Three       |  8.97100   11.00000  abcd        |  8.97100   11.00000  2.02900   45.74972   %
8.97100   13.40600  4.43500   Three       |  11.00000  13.40600  MMMM        |  11.00000  13.40600  2.40600   54.25028   %
13.40600  13.82700  0.42100   Zero        |  13.40600  13.82700  Zero        |  13.40600  13.82700  0.42100   100.00000  %
13.82700  15.93555  2.10855   Two         |  13.82700  15.93555  One         |  13.82700  15.93555  2.10855   100.00000  %

: , , - . . truth-lab, - % .

+2

, , , .

, . , , :

def in_range(truth_item, test_item):
    """Tell whether the test interval lies entirely inside the truth interval.

    Both arguments are indexable with the interval start at index 0 and the
    interval end at index 1.  The bounds are inclusive, so identical
    intervals count as contained.
    """
    starts_inside = truth_item[0] <= test_item[0]
    if not starts_inside:
        return False
    return truth_item[1] >= test_item[1]


def update_test_items(truth_items, test_items):
    """Copy truth labels onto the test items they contain, in place.

    For each test item, the truth items are searched forward from the last
    matched position for the first one whose interval contains the test
    interval (as decided by ``in_range``); that truth item's label
    (index 2) is then written to the test item's index 2.

    A test item that no remaining truth item contains is left unchanged and
    the search position is not moved, so one uncontained test item no longer
    stops every later item from being relabelled.  An empty ``truth_items``
    is a no-op.

    Args:
        truth_items: sequence of [start, end, label] rows.
        test_items: sequence of mutable [start, end, label] rows.

    Returns:
        None; ``test_items`` is modified in place.

    The scan stays linear when every test item is contained in some truth
    item; each uncontained test item costs one extra pass over the remaining
    truth items.
    """
    current_truth_index = 0
    for test_item in test_items:
        for candidate_index in range(current_truth_index, len(truth_items)):
            if in_range(truth_items[candidate_index], test_item):
                # Remember the match so the next test item resumes here.
                current_truth_index = candidate_index
                test_item[2] = truth_items[candidate_index][2]
                break


# Relabel the rows of `test` in place from the rows of `truth`.
# NOTE(review): assumes both are lists of mutable [start, end, label] rows,
# since update_test_items assigns to index 2 of each test row - confirm
# against the code that loads them.
update_test_items(truth, test)

update_test_items , .

Now you can set the condition for the update if you want, say, 80% coverage and leave the value unchanged if this is not done.

def has_enough_coverage(truth_item, test_item, min_coverage=.8):
    """Tell whether the test interval is long enough relative to the truth interval.

    Only the two lengths are compared; the positions of the intervals are
    not examined.

    Args:
        truth_item: indexable with interval start at index 0, end at index 1.
        test_item: indexable with interval start at index 0, end at index 1.
        min_coverage: minimum required ratio of test length to truth length.

    Returns:
        True when test length / truth length is at least ``min_coverage``.
        A zero-length truth interval is treated as fully covered and yields
        True, instead of raising ZeroDivisionError.
    """
    truth_item_size = truth_item[1] - truth_item[0]
    test_item_size = test_item[1] - test_item[0]
    if truth_item_size == 0:
        # Nothing to cover, so the ratio is undefined; treat as covered.
        return True
    return test_item_size / truth_item_size >= min_coverage


def in_range(truth_item, test_item):
    """Return True when the truth interval fully contains the test interval.

    Index 0 of each item is the interval start and index 1 is the interval
    end; both comparisons are inclusive.
    """
    return (truth_item[1] >= test_item[1]) if truth_item[0] <= test_item[0] else False


def update_test_items(truth_items, test_items):
    """Copy truth labels onto sufficiently covering test items, in place.

    For each test item, the truth items are searched forward from the last
    matched position for the first one whose interval contains the test
    interval (``in_range``).  The truth label (index 2) is written to the
    test item only if ``has_enough_coverage`` also accepts the pair;
    otherwise the test item keeps its label.

    A test item that no remaining truth item contains is left unchanged and
    the search position is not moved, so one uncontained test item no longer
    stops every later item from being processed.  An empty ``truth_items``
    is a no-op.

    Args:
        truth_items: sequence of [start, end, label] rows.
        test_items: sequence of mutable [start, end, label] rows.

    Returns:
        None; ``test_items`` is modified in place.

    The scan stays linear when every test item is contained in some truth
    item; each uncontained test item costs one extra pass over the remaining
    truth items.
    """
    current_truth_index = 0
    for test_item in test_items:
        for candidate_index in range(current_truth_index, len(truth_items)):
            truth_item = truth_items[candidate_index]
            if in_range(truth_item, test_item):
                # Remember the match so the next test item resumes here,
                # whether or not the coverage check lets the label through.
                current_truth_index = candidate_index
                if has_enough_coverage(truth_item, test_item):
                    test_item[2] = truth_item[2]
                break


# Relabel the rows of `test` in place from the rows of `truth`, subject to
# the coverage check.
# NOTE(review): assumes both are lists of mutable [start, end, label] rows,
# since update_test_items assigns to index 2 of each test row - confirm
# against the code that loads them.
update_test_items(truth, test)

This will update the test item only if it covers 80% or more of the truth range.

Please note that these functions only work if the initial assumptions hold; otherwise you will run into problems. Under those assumptions the approach is also efficient, running in O(N) time.

+1
source

Source: https://habr.com/ru/post/1693895/


All Articles