代码之家  ›  专栏  ›  技术社区  ›  MyopicVisage

连接两个modin.pandas.DataFrame

  •  0
  • MyopicVisage  · 技术社区  · 6 年前

    我尝试对两个 modin.DataFrame 进行 join/merge/concat(连接/合并/拼接),但都失败了。有人成功完成过这个操作吗?这里使用的是面向大数据的 Modin 项目的实现。

    来源如下: https://github.com/modin-project/modin/blob/master/modin/pandas/dataframe.py https://github.com/modin-project/modin/blob/master/modin/pandas/concat.py

    示例:

    import modin.pandas as pd
    
    vals = pd.DataFrame([1,2,3,4], index=['2018-01-01','2018-01-02','2018-01-03','2018-01-04'], columns=['Col1']); # print(vals) # 
    table = pd.DataFrame([5,6,7,8], index=['2018-01-01','2018-01-02','2018-01-03','2018-01-04'], columns=['Col2']); # print(table)
    

    第一次尝试:modin.pandas.DataFrame.join

    result = table.join(other=vals, on=None, how='outer', sort=False); # Modin: Doesn't work
    
    print(result)
    

    Suppressing duplicate error message.
    Suppressing duplicate error message.
    Suppressing duplicate error message.
    
    ---------------------------------------------------------------------------
    ValueError                                Traceback (most recent call last)
    ~/anaconda3/lib/python3.6/site-packages/IPython/core/formatters.py in __call__(self, obj)
        700                 type_pprinters=self.type_printers,
        701                 deferred_pprinters=self.deferred_printers)
    --> 702             printer.pretty(obj)
        703             printer.flush()
        704             return stream.getvalue()
    
    ~/anaconda3/lib/python3.6/site-packages/IPython/lib/pretty.py in pretty(self, obj)
        393                             if callable(meth):
        394                                 return meth(obj, self, cycle)
    --> 395             return _default_pprint(obj, self, cycle)
        396         finally:
        397             self.end_group()
    
    ~/anaconda3/lib/python3.6/site-packages/IPython/lib/pretty.py in _default_pprint(obj, p, cycle)
        508     if _safe_getattr(klass, '__repr__', None) is not object.__repr__:
        509         # A user-provided repr. Find newlines and replace them with p.break_()
    --> 510         _repr_pprint(obj, p, cycle)
        511         return
        512     p.begin_group(1, '<')
    
    ~/anaconda3/lib/python3.6/site-packages/IPython/lib/pretty.py in _repr_pprint(obj, p, cycle)
        699     """A pprint that just redirects to the normal repr function."""
        700     # Find newlines and replace them with p.break_()
    --> 701     output = repr(obj)
        702     for idx,output_line in enumerate(output.splitlines()):
        703         if idx:
    
    ~/anaconda3/lib/python3.6/site-packages/modin/pandas/dataframe.py in __repr__(self)
        454         if len(self._row_metadata) <= 60 and \
        455            len(self._col_metadata) <= 20:
    --> 456             return repr(self._repr_pandas_builder())
        457         # The split here is so that we don't repr pandas row lengths.
        458         result = self._repr_pandas_builder()
    
    ~/anaconda3/lib/python3.6/site-packages/modin/pandas/dataframe.py in _repr_pandas_builder(self)
        382         # If we don't exceed the maximum number of values on either dimension
        383         if len(self.index) <= 60 and len(self.columns) <= 20:
    --> 384             return to_pandas(self)
        385 
        386         if len(self.index) >= 60:
    
    ~/anaconda3/lib/python3.6/site-packages/modin/pandas/utils.py in to_pandas(df)
        259     """
        260     pandas_df = pandas.concat(ray.get(df._row_partitions), copy=False)
    --> 261     pandas_df.index = df.index
        262     pandas_df.columns = df.columns
        263     return pandas_df
    
    ~/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py in __setattr__(self, name, value)
       3625         try:
       3626             object.__getattribute__(self, name)
    -> 3627             return object.__setattr__(self, name, value)
       3628         except AttributeError:
       3629             pass
    
    pandas/_libs/properties.pyx in pandas._libs.properties.AxisProperty.__set__()
    
    ~/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py in _set_axis(self, axis, labels)
        557 
        558     def _set_axis(self, axis, labels):
    --> 559         self._data.set_axis(axis, labels)
        560         self._clear_item_cache()
        561 
    
    ~/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py in set_axis(self, axis, new_labels)
       3072             raise ValueError('Length mismatch: Expected axis has %d elements, '
       3073                              'new values have %d elements' %
    -> 3074                              (old_len, new_len))
       3075 
       3076         self.axes[axis] = new_labels
    
    ValueError: Length mismatch: Expected axis has 8 elements, new values have 4 elements
    
    ---------------------------------------------------------------------------
    ValueError                                Traceback (most recent call last)
    ~/anaconda3/lib/python3.6/site-packages/IPython/core/formatters.py in __call__(self, obj)
        343             method = get_real_method(obj, self.print_method)
        344             if method is not None:
    --> 345                 return method()
        346             return None
        347         else:
    
    ~/anaconda3/lib/python3.6/site-packages/modin/pandas/dataframe.py in _repr_html_(self)
        473         if len(self._row_metadata) <= 60 and \
        474            len(self._col_metadata) <= 20:
    --> 475             return self._repr_pandas_builder()._repr_html_()
        476         # We split so that we insert our correct dataframe dimensions.
        477         result = self._repr_pandas_builder()._repr_html_()
    
    ~/anaconda3/lib/python3.6/site-packages/modin/pandas/dataframe.py in _repr_pandas_builder(self)
        382         # If we don't exceed the maximum number of values on either dimension
        383         if len(self.index) <= 60 and len(self.columns) <= 20:
    --> 384             return to_pandas(self)
        385 
        386         if len(self.index) >= 60:
    
    ~/anaconda3/lib/python3.6/site-packages/modin/pandas/utils.py in to_pandas(df)
        259     """
        260     pandas_df = pandas.concat(ray.get(df._row_partitions), copy=False)
    --> 261     pandas_df.index = df.index
        262     pandas_df.columns = df.columns
        263     return pandas_df
    
    ~/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py in __setattr__(self, name, value)
       3625         try:
       3626             object.__getattribute__(self, name)
    -> 3627             return object.__setattr__(self, name, value)
       3628         except AttributeError:
       3629             pass
    
    pandas/_libs/properties.pyx in pandas._libs.properties.AxisProperty.__set__()
    
    ~/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py in _set_axis(self, axis, labels)
        557 
        558     def _set_axis(self, axis, labels):
    --> 559         self._data.set_axis(axis, labels)
        560         self._clear_item_cache()
        561 
    
    ~/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py in set_axis(self, axis, new_labels)
       3072             raise ValueError('Length mismatch: Expected axis has %d elements, '
       3073                              'new values have %d elements' %
    -> 3074                              (old_len, new_len))
       3075 
       3076         self.axes[axis] = new_labels
    
    ValueError: Length mismatch: Expected axis has 8 elements, new values have 4 elements
    

    第二次尝试:modin.pandas.concat

    result = pd.concat([table, vals], axis=1,); # Modin: Doesn't work
    print(result)
    

    相应的错误消息:

    > Suppressing duplicate error message.
    > Suppressing duplicate error message.
    > Suppressing duplicate error message.
    > 
    > ---------------------------------------------------------------------------
    > ValueError                                Traceback (most recent call last)
    > <ipython-input-3-4bf001fd75fb> in <module>()
    >       2 result = pd.concat([table, vals], axis=1,); # Modin: Doesn't work
    >       3 
    > ----> 4 print(result)
    > 
    > ~/anaconda3/lib/python3.6/site-packages/modin/pandas/dataframe.py in
    > __str__(self)
    >     229 
    >     230     def __str__(self):
    > --> 231         return repr(self)
    >     232 
    >     233     def _repr_pandas_builder(self):
    > 
    > ~/anaconda3/lib/python3.6/site-packages/modin/pandas/dataframe.py in
    > __repr__(self)
    >     454         if len(self._row_metadata) <= 60 and \
    >     455            len(self._col_metadata) <= 20:
    > --> 456             return repr(self._repr_pandas_builder())
    >     457         # The split here is so that we don't repr pandas row lengths.
    >     458         result = self._repr_pandas_builder()
    > 
    > ~/anaconda3/lib/python3.6/site-packages/modin/pandas/dataframe.py in
    > _repr_pandas_builder(self)
    >     382         # If we don't exceed the maximum number of values on either dimension
    >     383         if len(self.index) <= 60 and len(self.columns) <= 20:
    > --> 384             return to_pandas(self)
    >     385 
    >     386         if len(self.index) >= 60:
    > 
    > ~/anaconda3/lib/python3.6/site-packages/modin/pandas/utils.py in
    > to_pandas(df)
    >     259     """
    >     260     pandas_df = pandas.concat(ray.get(df._row_partitions), copy=False)
    > --> 261     pandas_df.index = df.index
    >     262     pandas_df.columns = df.columns
    >     263     return pandas_df
    > 
    > ~/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py in __setattr__(self, name, value)
    >    3625         try:
    >    3626             object.__getattribute__(self, name)
    > -> 3627             return object.__setattr__(self, name, value)
    >    3628         except AttributeError:
    >    3629             pass
    > 
    > pandas/_libs/properties.pyx in
    > pandas._libs.properties.AxisProperty.__set__()
    > 
    > ~/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py in
    > _set_axis(self, axis, labels)
    >     557 
    >     558     def _set_axis(self, axis, labels):
    > --> 559         self._data.set_axis(axis, labels)
    >     560         self._clear_item_cache()
    >     561 
    > 
    > ~/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py in set_axis(self, axis, new_labels)
    >    3072             raise ValueError('Length mismatch: Expected axis has %d elements, '
    >    3073                              'new values have %d elements' %
    > -> 3074                              (old_len, new_len))
    >    3075 
    >    3076         self.axes[axis] = new_labels
    > 
    > ValueError: Length mismatch: Expected axis has 8 elements, new values have 4 elements
    
    1 回复  |  直到 5 年前
        1
  •  2
  •   Devin    6 年前

    此问题已通过最近对后端的重写得到修复。此前,Modin 在处理极小的分区时一直存在问题。该修复尚未正式发布。如果您现在就想尝试,请按以下方式(从当前 master 分支)安装 Modin:

    pip install git+https://github.com/modin-project/modin