Python average tabular data

I have the following working program. It opens a column-oriented data file that is too large for Excel and computes the average value of each column:

Sample data:

Joe Sam Bob
1   2   3
2   1   3

And it returns:

Joe Sam Bob
1.5 1.5 3

That works. The problem is that some columns contain NA as a value. I want to skip these NA entries and calculate the average of the remaining values. So,

Bobby
1
NA
2

Should output as

Bobby
1.5

Here is my existing program, which I built with help from here. Any help is appreciated!

# Read a space-separated table whose first line holds the column names and
# write the average of every column to a second file (Python 2).
with open('C://avy.txt', "rtU") as f:
    # Header line: one name per column.
    columns = f.readline().strip().split(" ")
    numRows = 0
    # One running total per column.
    sums = [0] * len(columns)

    for line in f:
        # Skip empty lines
        if not line.strip():
            continue

        values = line.split(" ")
        for i in xrange(len(values)):
            # int() raises ValueError for anything that is not an integer
            # literal, which includes the "NA" marker.
            sums[i] += int(values[i])
        numRows += 1

# Write the averages once, after the whole input has been read. Doing this
# inside the row loop re-created the output file for every data row.
# With no data rows there is nothing to average, so no file is written.
if numRows:
    with open('c://finished.txt', 'w') as ouf:
        for index, summedRowValue in enumerate(sums):
            print>>ouf, columns[index], 1.0 * summedRowValue / numRows

Now I have this:

with open ('C: //avy.txt', "rtU") as f:

def get_averages(f):  # as indented here, the function body ends after "lino = 1"
   headers = f.readline().split()
   ncols = len(headers)
   sumx0 = [0] * ncols
   sumx1 = [0.0] * ncols
   lino = 1  # local to get_averages

for line in f:  # NOTE(review): at column 0 this loop is outside get_averages, so lino, ncols, sumx0 and sumx1 do not exist here
   lino += 1  # the statement the reported NameError points at
   values = line.split()

for colindex, x in enumerate(values):  # NOTE(review): also dedented; it is no longer nested inside the line loop
        if colindex >= ncols:
             print >> sys.stderr, "Extra data %r in row %d, column %d" %(x, lino, colindex+1)
             continue
             try:  # NOTE(review): indented under the 'if' after 'continue', so it can never run
                value = float(x)
             except ValueError:
               continue
               sumx0[colindex] += 1  # NOTE(review): placed after 'continue' inside the except block, so unreachable
        sumx1[colindex] += value
        print headers
print sumx1
print sumx0
averages = [
    total / count if count else None
   for total, count in zip(sumx1, sumx0)
    ]
print averages

and it says:

Traceback (most recent call last):
  File "C:/avy10.py", line 11, in <module>
    lino += 1
NameError: name 'lino' is not defined

0
5

, ... , . (1), (2), . - "NA" , "NA" .

>>> import sys, StringIO
>>>
>>> data = """\
... Jim Joe Billy Bob
... 1   2   3     x
... 2   x   x     x  666
...
... 3   4   5     x
... """
>>>
>>> def get_averages(f):
...     headers = f.readline().split()
...     ncols = len(headers)
...     sumx0 = [0] * ncols
...     sumx1 = [0.0] * ncols
...     lino = 1
...     for line in f:
...         lino += 1
...         values = line.split()
...         for colindex, x in enumerate(values):
...             if colindex >= ncols:
...                 print >> sys.stderr, "Extra data %r in row %d, column %d" % (x, lino, colindex+1)
...                 continue
...             try:
...                 value = float(x)
...             except ValueError:
...                 continue
...             sumx0[colindex] += 1
...             sumx1[colindex] += value
...     print headers
...     print sumx1
...     print sumx0
...     averages = [
...         total / count if count else None
...         for total, count in zip(sumx1, sumx0)
...         ]
...     print averages

:

...     return headers, averages

...
>>> sio = StringIO.StringIO(data)
>>> get_averages(sio)
Extra data '666' in row 3, column 5
['Jim', 'Joe', 'Billy', 'Bob']
[6.0, 6.0, 8.0, 0.0]
[3, 2, 2, 0]
[2.0, 3.0, 4.0, None]
>>>

Edit

:

# Open the data file, hand the file object to get_averages and unpack the
# (headers, averages) pair it returns.
with open('myfile.text') as mf:
   hdrs, avgs = get_averages(mf)
-1

:

text = """Joe Sam Bob
1   2   3
2   1   3
NA 2 3
3 5 NA"""

def avg( lst ):
    """Return the arithmetic mean of ``lst`` as a float.

    An empty list has no mean, so None is returned instead of raising
    ZeroDivisionError. An empty list is what a column made up entirely
    of 'NA' entries turns into once the 'NA' markers are filtered out.
    """
    if not lst:
        return None
    # float() forces true division under Python 2 as well.
    return float(sum(lst)) / len(lst)

# split that text
parts = [line.split() for line in text.splitlines()]
#remove the headers
names = parts.pop(0)
# zip(*m) does something like transpose a matrix :-)
columns = zip(*parts)
# convert to numbers and leave out the NA
numbers = [[int(x) for x in column if x != 'NA' ] for column in columns]
# all left is averaging
averages = [avg(col) for col in numbers]
# and printing
for name, x in zip( names, averages):
    print name, x

, , .

+3

[ ]

, . , 3, Python, .

. . , . , float, ( , ), ValueError.

, , . float, . , .

, " " (, " " ), , . Python, .

, ( ). .

class Accumulator(object):
    """
    Accumulate the arithmetic mean of a stream of numbers.

    Items are fed in one at a time through ``add``. Anything that
    ``float()`` cannot convert (for example the string 'NA', or None)
    is skipped. This implementation does not allow removing items that
    were already accumulated, but it could easily be modified to do so.
    Other statistics could be accumulated in the same way.
    """

    def __init__(self):
        # Nothing has been accumulated yet: the number of accepted items
        # (_n) and their running total (_sum) both start at zero.
        self._n = 0
        self._sum = 0.0

    def add(self, item):
        """Add ``item`` to the accumulator if it can be read as a float.

        ``item`` may be a number or a numeric string. Items that
        ``float()`` rejects are ignored and leave the state untouched.
        """
        try:
            value = float(item)
        except (TypeError, ValueError):
            # ValueError: a string that is not a number, such as 'NA'.
            # TypeError: an object float() does not accept, such as None.
            return
        self._sum += value
        self._n += 1

    @property
    def mean(self):
        """Arithmetic mean of the accepted items, or None if there are none."""
        if self._n > 0:
            return self._sum / self._n
        # Nothing accumulated yet (raising an exception would be the
        # alternative design).
        return None

# using the object:

# Create an instance of the object "Accumulator"
my_accumulator = Accumulator()
print my_accumulator.mean
# prints None because there are no items accumulated

# add one (a number)
my_accumulator.add(1)
print my_accumulator.mean
# prints 1.0

# add two (a string - it will be converted to a float)
my_accumulator.add('2')
print my_accumulator.mean
# prints 1.5

# add a 'NA' (will be ignored because it cannot be converted to float)
my_accumulator.add('NA')
print my_accumulator.mean
# prints 1.5 (notice that it ignored the 'NA')

.

+2

:

    # NOTE(review): assuming `line` comes straight from iterating the file
    # (as in the asker's program), split(" ") leaves the trailing newline on
    # the last field, so an NA in the last column arrives as "NA\n", fails
    # the equality test below and still reaches int().
    values = line.split(" ")
    for i in xrange(len(values)):
        # Skip the missing-value marker instead of passing it to int().
        if values[i] == "NA":
            continue
        sums[i] += int(values[i])
    # NOTE(review): numRows is incremented for every row, including rows in
    # which a column was skipped as NA. If the average is still computed as
    # sum / numRows (as in the asker's program), a column containing NA is
    # divided by too many rows -- confirm a per-column count is used.
    numRows += 1
-1

:

# Read a whitespace-separated table from the file 'in' and write the column
# names and the per-column averages, tab-separated, to the file 'out'.
with open('in', "rtU") as f:
    # Keep only the lines that contain something other than whitespace.
    lines = [l for l in f if l.strip()]
    # Header row: the column names, re-joined with tabs for the output.
    names = '\t'.join(lines[0].split())
    # split() without arguments already discards surrounding whitespace,
    # so the fields need no further stripping.
    numbers = [line.split() for line in lines[1:]]
    # Transpose the rows so that each tuple holds one column's fields ...
    person_data = zip(*numbers)
    # ... then convert them to ints, leaving out the "NA" markers.
    person_data = [tuple(int(i) for i in t if i != "NA") for t in person_data]
    # A column that held nothing but "NA" has no values to average; report
    # it as "NA" instead of dividing by zero.
    averages = [str(float(sum(x)) / len(x)) if x else "NA"
                for x in person_data]

with open('out', 'w') as f:
    f.write(names)
    f.write('\n')
    f.write('\t'.join(averages))

, . :

  • , , . .
  • I tried to make this line a little more readable, but to be honest, I don’t understand why you called it confusing in the first place.
  • You indicated a logical error in my code. I think I really shouldn't have done this in the middle of the class ... for that I apologize
  • I agree that readlines () is redundant. I did not have a suitable python interpreter to test this, so I left it as security

Hope this is better.

-1
source

Source: https://habr.com/ru/post/1766107/


All Articles