Is there a Numpy or Pandas parameter to issue a warning when creating a NaN value?

I spend a lot of time working with Pandas, which uses numpy arrays to store numbers.

In my use case, there should never be any NaN values - they indicate that something went wrong (usually a Pandas-related mistake, such as incorrectly joined data, poorly loaded data, etc.).

It would be useful if Pandas or Numpy had a parameter that immediately issued a warning if the NaN value appeared in any series in the data frame. (This question is not about replacing or imputing NaN. Just a warning.)

Yes, at each stage you can write a lot of local checks (do this thing. Now check whether you created NaNs. Do this other thing. Check again whether you created NaNs, etc.), but it is terribly verbose and inefficient. What I want to say to Pandas is this: "if you ever put a NaN value in a dataframe, immediately issue a warning" - one time, as a global setting at the top of my Jupyter notebook.

Does anyone know if there is a global setting for this?

+4
source share
1 answer

If you just want to raise a warning, you can check whether your DataFrame contains any NaN with df.isnull().values.any(); then you can use the warnings module to raise a warning.

Here is a working example:

>>> from StringIO import StringIO 
>>> import pandas as pd 
>>> st = """ 
... col1|col2
... 1|
... 2|3 
... """
>>> df = pd.read_csv(StringIO(st),sep="|") 
>>> df.head() 
   col1  col2
0     1   NaN
1     2     3
>>> import warnings
>>> if df.isnull().values.any(): 
...     warnings.warn("there is NaN")
... 
__main__:2: UserWarning: there is NaN
>>> 

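To do this inside pandas itself, you would have to modify the DataFrame class so that it warns whenever the data contains NaN. That means patching the pandas source code. For reference, here is the DataFrame constructor: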

def __init__(self, data=None, index=None, columns=None, dtype=None,
             copy=False):
    """Build the internal manager for ``data`` and initialise the frame.

    The constructor dispatches on the type of ``data``, builds a manager
    object (``mgr``) through the matching helper, and finally passes that
    manager to ``NDFrame.__init__``. The branches are tested in order, so
    the first matching type wins.

    Parameters
    ----------
    data : object, optional
        Input to build the frame from. ``None`` is replaced by an empty
        dict. Handled types, in order: DataFrame, BlockManager, dict,
        masked array, ndarray/Series/Index, list or generator, and finally
        anything ``np.array`` can convert to a 0-d array (a scalar).
    index, columns : optional
        Forwarded to the helper chosen for ``data``. Both must be given
        for the scalar case.
    dtype : optional
        When not ``None`` it is first passed through
        ``self._validate_dtype`` and then forwarded to the helpers.
    copy : bool, default False
        Forwarded to ``_init_mgr``, ``_init_ndarray``,
        ``_masked_rec_array_to_mgr`` and ``np.array``. It is not passed to
        ``_init_dict``.

    Raises
    ------
    TypeError
        If ``data`` is a ``collections.Iterator`` that is not a generator,
        or if ``np.array(data, dtype=dtype, copy=copy)`` fails with a
        ``ValueError`` or ``TypeError``.
    ValueError
        If ``data`` reaches the fallback branch and is not a 0-d value
        accompanied by both ``index`` and ``columns``.
    """
    # No data behaves like an empty dict, so the dict branch handles it.
    if data is None:
        data = {}
    if dtype is not None:
        dtype = self._validate_dtype(dtype)

    # Unwrap an existing frame to its ``_data`` attribute; the dispatch
    # below then runs on that object instead of on the frame.
    if isinstance(data, DataFrame):
        data = data._data

    if isinstance(data, BlockManager):
        mgr = self._init_mgr(data, axes=dict(index=index, columns=columns),
                             dtype=dtype, copy=copy)
    elif isinstance(data, dict):
        mgr = self._init_dict(data, index, columns, dtype=dtype)
    # Masked arrays are tested before the generic ndarray branch below.
    # NOTE(review): assuming ``ma`` is ``numpy.ma``, MaskedArray subclasses
    # ndarray, so this ordering matters - confirm against the imports.
    elif isinstance(data, ma.MaskedArray):
        import numpy.ma.mrecords as mrecords
        # masked recarray
        if isinstance(data, mrecords.MaskedRecords):
            mgr = _masked_rec_array_to_mgr(data, index, columns, dtype,
                                           copy)

        # a masked array
        else:
            mask = ma.getmaskarray(data)
            if mask.any():
                # At least one entry is masked: upcast a copy and write the
                # fill value returned by maybe_upcast into the masked slots.
                data, fill_value = maybe_upcast(data, copy=True)
                data[mask] = fill_value
            else:
                # Nothing is masked: just work on a copy of the data.
                data = data.copy()
            mgr = self._init_ndarray(data, index, columns, dtype=dtype,
                                     copy=copy)

    elif isinstance(data, (np.ndarray, Series, Index)):
        if data.dtype.names:
            # The dtype has named fields: split the data into one entry per
            # field name and reuse the dict path. The field names become
            # the columns unless the caller supplied some.
            data_columns = list(data.dtype.names)
            data = dict((k, data[k]) for k in data_columns)
            if columns is None:
                columns = data_columns
            mgr = self._init_dict(data, index, columns, dtype=dtype)
        elif getattr(data, 'name', None) is not None:
            # A named object becomes a single entry keyed by its name.
            mgr = self._init_dict({data.name: data}, index, columns,
                                  dtype=dtype)
        else:
            mgr = self._init_ndarray(data, index, columns, dtype=dtype,
                                     copy=copy)
    elif isinstance(data, (list, types.GeneratorType)):
        # Generators are materialised so they can be measured and indexed.
        if isinstance(data, types.GeneratorType):
            data = list(data)
        if len(data) > 0:
            # Only the first element is inspected: it must be list-like and
            # one-dimensional (``ndim`` defaults to 1 when it is absent).
            if is_list_like(data[0]) and getattr(data[0], 'ndim', 1) == 1:
                # Named tuples supply the column names when none are given.
                if is_named_tuple(data[0]) and columns is None:
                    columns = data[0]._fields
                arrays, columns = _to_arrays(data, columns, dtype=dtype)
                columns = _ensure_index(columns)

                # set the index
                if index is None:
                    if isinstance(data[0], Series):
                        index = _get_names_from_index(data)
                    elif isinstance(data[0], Categorical):
                        # Sized by the first element, not by the list.
                        index = _default_index(len(data[0]))
                    else:
                        index = _default_index(len(data))

                # ``columns`` is passed as both the second and the fourth
                # positional argument here.
                mgr = _arrays_to_mgr(arrays, columns, index, columns,
                                     dtype=dtype)
            else:
                mgr = self._init_ndarray(data, index, columns, dtype=dtype,
                                         copy=copy)
        else:
            # An empty list is treated like an empty dict.
            mgr = self._init_dict({}, index, columns, dtype=dtype)
    # Generators were handled above, so only other iterators land here.
    elif isinstance(data, collections.Iterator):
        raise TypeError("data argument can't be an iterator")
    else:
        # Fallback: let numpy try to convert whatever was passed in.
        try:
            arr = np.array(data, dtype=dtype, copy=copy)
        except (ValueError, TypeError) as e:
            # Conversion failures are re-raised as a TypeError carrying the
            # original message.
            exc = TypeError('DataFrame constructor called with '
                            'incompatible data and dtype: %s' % e)
            raise_with_traceback(exc)

        # A 0-d result with both axes given is filled into every cell of a
        # (len(index), len(columns)) array.
        if arr.ndim == 0 and index is not None and columns is not None:
            # Strings default to object dtype when no dtype was requested.
            if isinstance(data, compat.string_types) and dtype is None:
                dtype = np.object_
            if dtype is None:
                dtype, data = infer_dtype_from_scalar(data)

            values = np.empty((len(index), len(columns)), dtype=dtype)
            values.fill(data)
            mgr = self._init_ndarray(values, index, columns, dtype=dtype,
                                     copy=False)
        else:
            raise ValueError('DataFrame constructor not properly called!')

    # Hand the manager built above to the base-class initialiser.
    NDFrame.__init__(self, mgr, fastpath=True)

In short, doing this means changing pandas itself.

+2

Source: https://habr.com/ru/post/1682302/


All Articles