NumPy - how to outer join arrays

I am trying to combine the three arrays below into one. This is basically equivalent to a SQL outer join (where the "pos" field is the key / index).

a1 = array([('2:6506', 4.6725971801473496e-25, 0.99999999995088695),
       ('2:6601', 2.2452745388799898e-27, 0.99999999995270605),
       ('2:21801', 1.9849650921836601e-31, 0.99999999997999001),], 
      dtype=[('pos', '|S100'), ('col1', '<f8'), ('col2', '<f8')])

a2 = array([('3:6506', 4.6725971801473496e-25, 0.99999999995088695),
       ('3:6601', 2.2452745388799898e-27, 0.99999999995270605),
       ('3:21801', 1.9849650921836601e-31, 0.99999999997999001),], 
      dtype=[('pos', '|S100'), ('col1', '<f8'), ('col2', '<f8')])

a3 = array([('2:6506', 4.6725971801473496e-25, 0.99999999995088695),
       ('2:6601', 2.2452745388799898e-27, 0.99999999995270605),
       ('2:21801', 1.9849650921836601e-31, 0.99999999997999001),], 
      dtype=[('pos', '|S100'), ('col3', '<f8'), ('col4', '<f8')])

Desired Result:

array([('2:6506', 4.6725971801473496e-25, 0.99999999995088695, 4.6725971801473496e-25, 0.99999999995088695),
       ('2:6601', 2.2452745388799898e-27, 0.99999999995270605, 2.2452745388799898e-27, 0.99999999995270605),
       ('2:21801', 1.9849650921836601e-31, 0.99999999997999001, 1.9849650921836601e-31, 0.99999999997999001),
       ('3:6506', 4.6725971801473496e-25, 0.99999999995088695, NaN, NaN),
       ('3:6601', 2.2452745388799898e-27, 0.99999999995270605, NaN, NaN),
       ('3:21801', 1.9849650921836601e-31, 0.99999999997999001, NaN, NaN),
        ], 
      dtype=[('pos', '|S100'), ('col1', '<f8'), ('col2', '<f8'), ('col3', '<f8'), ('col4', '<f8')])

I think this answer may be on the right track, I just can't figure out how to apply it.

Update:

I tried to run unutbu's answer, but I get this error:

Traceback (most recent call last):
  File "fail2.py", line 21, in <module>
    a4 = recfunctions.join_by('pos', a4, a, jointype='outer')
  File "/usr/local/msg/lib/python2.6/site-packages/numpy/lib/recfunctions.py", line 973, in join_by
    current = output[f]
  File "/usr/local/msg/lib/python2.6/site-packages/numpy/ma/core.py", line 2943, in __getitem__
    dout = ndarray.__getitem__(_data, indx)
ValueError: field named col12 not found.

Update 2

I got this error only on numpy 1.5.1. I upgraded to 1.8.1 and the error went away.

+4
source share
1 answer
import numpy as np
import numpy.lib.recfunctions as recfunctions

# Three structured arrays keyed by the byte-string field 'pos'.
# a1 and a2 share the value columns col1/col2 but have different keys;
# a3 has the same keys as a1 but carries col3/col4 instead.
a1 = np.array(
    [('2:6506', 4.6725971801473496e-25, 0.99999999995088695),
     ('2:6601', 2.2452745388799898e-27, 0.99999999995270605),
     ('2:21801', 1.9849650921836601e-31, 0.99999999997999001)],
    dtype=[('pos', '|S100'), ('col1', '<f8'), ('col2', '<f8')])

a2 = np.array(
    [('3:6506', 4.6725971801473496e-25, 0.99999999995088695),
     ('3:6601', 2.2452745388799898e-27, 0.99999999995270605),
     ('3:21801', 1.9849650921836601e-31, 0.99999999997999001)],
    dtype=[('pos', '|S100'), ('col1', '<f8'), ('col2', '<f8')])

a3 = np.array(
    [('2:6506', 4.6725971801473496e-25, 0.99999999995088695),
     ('2:6601', 2.2452745388799898e-27, 0.99999999995270605),
     ('2:21801', 1.9849650921836601e-31, 0.99999999997999001)],
    dtype=[('pos', '|S100'), ('col3', '<f8'), ('col4', '<f8')])

# Fold the arrays together with successive outer joins, starting from a1.
result = a1
for a in (a2, a3):
    # Join on every field the two arrays have in common.  The shared names
    # are taken in `result`'s own field order rather than from a set
    # intersection, whose iteration order depends on string hashing and can
    # therefore differ from one run to the next.
    cols = [name for name in result.dtype.names if name in a.dtype.names]
    # Joining on *all* shared fields also avoids name collisions between
    # non-key columns (e.g. col1/col2 appearing in both a1 and a2).
    result = recfunctions.join_by(cols, result, a, jointype='outer')
print(result)

gives

[ ('2:21801', 1.98496509218366e-31, 0.99999999997999, 1.98496509218366e-31, 0.99999999997999)
 ('2:6506', 4.67259718014735e-25, 0.999999999950887, 4.67259718014735e-25, 0.999999999950887)
 ('2:6601', 2.24527453887999e-27, 0.999999999952706, 2.24527453887999e-27, 0.999999999952706)
 ('3:21801', 1.98496509218366e-31, 0.99999999997999, --, --)
 ('3:6506', 4.67259718014735e-25, 0.999999999950887, --, --)
 ('3:6601', 2.24527453887999e-27, 0.999999999952706, --, --)]

If you need SQL-like joins on NumPy arrays, consider using Pandas. Pandas is built on top of NumPy:

import numpy as np
import pandas as pd
# Three structured arrays keyed by the byte-string field 'pos'.
# a1 and a2 share the value columns col1/col2 but have different keys;
# a3 has the same keys as a1 but carries col3/col4 instead.
a1 = np.array(
    [('2:6506', 4.6725971801473496e-25, 0.99999999995088695),
     ('2:6601', 2.2452745388799898e-27, 0.99999999995270605),
     ('2:21801', 1.9849650921836601e-31, 0.99999999997999001)],
    dtype=[('pos', '|S100'), ('col1', '<f8'), ('col2', '<f8')])

a2 = np.array(
    [('3:6506', 4.6725971801473496e-25, 0.99999999995088695),
     ('3:6601', 2.2452745388799898e-27, 0.99999999995270605),
     ('3:21801', 1.9849650921836601e-31, 0.99999999997999001)],
    dtype=[('pos', '|S100'), ('col1', '<f8'), ('col2', '<f8')])

a3 = np.array(
    [('2:6506', 4.6725971801473496e-25, 0.99999999995088695),
     ('2:6601', 2.2452745388799898e-27, 0.99999999995270605),
     ('2:21801', 1.9849650921836601e-31, 0.99999999997999001)],
    dtype=[('pos', '|S100'), ('col3', '<f8'), ('col4', '<f8')])

# One DataFrame per structured array; the field names become column names.
dfs = [pd.DataFrame.from_records(a) for a in (a1, a2, a3)]

# Fold the frames together with successive outer merges.
result = dfs[0]
for df in dfs[1:]:
    # Merge on every column the two frames share, in `result`'s column
    # order.  A set intersection would yield a hash-dependent order, which
    # makes the key order passed to the merge vary between runs.
    cols = [name for name in result.columns if name in df.columns]
    # The key columns have the same names on both sides, so `on=` suffices.
    result = pd.merge(result, df, how='outer', on=cols)

print(result)

       pos          col1  col2          col3  col4
0   2:6506  4.672597e-25     1  4.672597e-25     1
1   2:6601  2.245275e-27     1  2.245275e-27     1
2  2:21801  1.984965e-31     1  1.984965e-31     1
3   3:6506  4.672597e-25     1           NaN   NaN
4   3:6601  2.245275e-27     1           NaN   NaN
5  3:21801  1.984965e-31     1           NaN   NaN

[6 rows x 5 columns]

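Pandas has richer join functionality than NumPy. For example, Pandas marks the missing values with NaN, as in the desired result, rather than with the ad hoc masked entries (`--`) that NumPy produces.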

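If needed, Pandas DataFrames can be converted back to plain NumPy arrays with `.values`, or to NumPy record arrays with `.to_records`. Conversely, as shown above, `DataFrame.from_records` builds DataFrames from structured arrays. So it is easy to move between DataFrames and NumPy arrays as required.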

Of course, if you would rather not depend on Pandas, the `join_by` approach shown first works with NumPy alone.

+6

Source: https://habr.com/ru/post/1539471/


All Articles