我無法將一個 Series 作為新列分配給 Koalas DataFrame。以下是我正在使用的代碼:
from databricks import koalas
dft=koalas.DataFrame({'a':[1,2,3],'b':[3,4,5]})
dft.assign(c=koalas.Series([1,2,3]))
輸出:
AnalysisException Traceback (most recent call last)
~/miniconda3/envs/pyspark/lib/python3.9/site-packages/IPython/core/formatters.py in __call__(self, obj)
700 type_pprinters=self.type_printers,
701 deferred_pprinters=self.deferred_printers)
--> 702 printer.pretty(obj)
703 printer.flush()
704 return stream.getvalue()
~/miniconda3/envs/pyspark/lib/python3.9/site-packages/IPython/lib/pretty.py in pretty(self, obj)
392 if cls is not object \
393 and callable(cls.__dict__.get('__repr__')):
--> 394 return _repr_pprint(obj, self, cycle)
395
396 return _default_pprint(obj, self, cycle)
~/miniconda3/envs/pyspark/lib/python3.9/site-packages/IPython/lib/pretty.py in _repr_pprint(obj, p, cycle)
698 """A pprint that just redirects to the normal repr function."""
699 # Find newlines and replace them with p.break_()
--> 700 output = repr(obj)
701 lines = output.splitlines()
702 with p.group():
~/miniconda3/envs/pyspark/lib/python3.9/site-packages/databricks/koalas/frame.py in __repr__(self)
11661 return self._to_internal_pandas().to_string()
11662
> 11663 pdf = self._get_or_create_repr_pandas_cache(max_display_count)
11664 pdf_length = len(pdf)
11665 pdf = pdf.iloc[:max_display_count]
~/miniconda3/envs/pyspark/lib/python3.9/site-packages/databricks/koalas/frame.py in _get_or_create_repr_pandas_cache(self, n)
11652 if not hasattr(self, "_repr_pandas_cache") or n not in self._repr_pandas_cache:
11653 object.__setattr__(
> 11654 self, "_repr_pandas_cache", {n: self.head(n + 1)._to_internal_pandas()}
11655 )
11656 return self._repr_pandas_cache[n]
~/miniconda3/envs/pyspark/lib/python3.9/site-packages/databricks/koalas/frame.py in head(self, n)
5748 return DataFrame(self._internal.with_filter(F.lit(False)))
5749 else:
-> 5750 sdf = self._internal.resolved_copy.spark_frame
5751 if get_option("compute.ordered_head"):
5752 sdf = sdf.orderBy(NATURAL_ORDER_COLUMN_NAME)
~/miniconda3/envs/pyspark/lib/python3.9/site-packages/databricks/koalas/utils.py in wrapped_lazy_property(self)
576 def wrapped_lazy_property(self):
577 if not hasattr(self, attr_name):
--> 578 setattr(self, attr_name, fn(self))
579 return getattr(self, attr_name)
580
~/miniconda3/envs/pyspark/lib/python3.9/site-packages/databricks/koalas/internal.py in resolved_copy(self)
1066 def resolved_copy(self) -> "InternalFrame":
1067 """ Copy the immutable InternalFrame with the updates resolved. """
-> 1068 sdf = self.spark_frame.select(self.spark_columns + list(HIDDEN_COLUMNS))
1069 return self.copy(
1070 spark_frame=sdf,
~/miniconda3/envs/pyspark/lib/python3.9/site-packages/pyspark/sql/dataframe.py in select(self, *cols)
1683 [Row(name='Alice', age=12), Row(name='Bob', age=15)]
1684 """
-> 1685 jdf = self._jdf.select(self._jcols(*cols))
1686 return DataFrame(jdf, self.sql_ctx)
1687
~/miniconda3/envs/pyspark/lib/python3.9/site-packages/py4j/java_gateway.py in __call__(self, *args)
1307
1308 answer = self.gateway_client.send_command(command)
-> 1309 return_value = get_return_value(
1310 answer, self.gateway_client, self.target_id, self.name)
1311
~/miniconda3/envs/pyspark/lib/python3.9/site-packages/pyspark/sql/utils.py in deco(*a, **kw)
115 # Hide where the exception came from that shows a non-Pythonic
116 # JVM exception message.
--> 117 raise converted from None
118 else:
119 raise
AnalysisException: Resolved attribute(s) 0#991184L missing from __index_level_0__#991164L,a#991165L,b#991166L,__natural_order__#991170L in operator !Project [__index_level_0__#991164L, a#991165L, b#991166L, 0#991184L AS c#991191L, __natural_order__#991170L].;
!Project [__index_level_0__#991164L, a#991165L, b#991166L, 0#991184L AS c#991191L, __natural_order__#991170L]
+- Project [__index_level_0__#991164L, a#991165L, b#991166L, monotonically_increasing_id() AS __natural_order__#991170L]
   +- LogicalRDD [__index_level_0__#991164L, a#991165L, b#991166L], false
---------------------------------------------------------------------------
AnalysisException Traceback (most recent call last)
~/miniconda3/envs/pyspark/lib/python3.9/site-packages/IPython/core/formatters.py in __call__(self, obj)
343 method = get_real_method(obj, self.print_method)
344 if method is not None:
--> 345 return method()
346 return None
347 else:
~/miniconda3/envs/pyspark/lib/python3.9/site-packages/databricks/koalas/frame.py in _repr_html_(self)
11684 return self._to_internal_pandas().to_html(notebook=True, bold_rows=bold_rows)
11685
> 11686 pdf = self._get_or_create_repr_pandas_cache(max_display_count)
11687 pdf_length = len(pdf)
11688 pdf = pdf.iloc[:max_display_count]
~/miniconda3/envs/pyspark/lib/python3.9/site-packages/databricks/koalas/frame.py in _get_or_create_repr_pandas_cache(self, n)
11652 if not hasattr(self, "_repr_pandas_cache") or n not in self._repr_pandas_cache:
11653 object.__setattr__(
> 11654 self, "_repr_pandas_cache", {n: self.head(n + 1)._to_internal_pandas()}
11655 )
11656 return self._repr_pandas_cache[n]
~/miniconda3/envs/pyspark/lib/python3.9/site-packages/databricks/koalas/frame.py in head(self, n)
5748 return DataFrame(self._internal.with_filter(F.lit(False)))
5749 else:
-> 5750 sdf = self._internal.resolved_copy.spark_frame
5751 if get_option("compute.ordered_head"):
5752 sdf = sdf.orderBy(NATURAL_ORDER_COLUMN_NAME)
~/miniconda3/envs/pyspark/lib/python3.9/site-packages/databricks/koalas/utils.py in wrapped_lazy_property(self)
576 def wrapped_lazy_property(self):
577 if not hasattr(self, attr_name):
--> 578 setattr(self, attr_name, fn(self))
579 return getattr(self, attr_name)
580
~/miniconda3/envs/pyspark/lib/python3.9/site-packages/databricks/koalas/internal.py in resolved_copy(self)
1066 def resolved_copy(self) -> "InternalFrame":
1067 """ Copy the immutable InternalFrame with the updates resolved. """
-> 1068 sdf = self.spark_frame.select(self.spark_columns + list(HIDDEN_COLUMNS))
1069 return self.copy(
1070 spark_frame=sdf,
~/miniconda3/envs/pyspark/lib/python3.9/site-packages/pyspark/sql/dataframe.py in select(self, *cols)
1683 [Row(name='Alice', age=12), Row(name='Bob', age=15)]
1684 """
-> 1685 jdf = self._jdf.select(self._jcols(*cols))
1686 return DataFrame(jdf, self.sql_ctx)
1687
~/miniconda3/envs/pyspark/lib/python3.9/site-packages/py4j/java_gateway.py in __call__(self, *args)
1307
1308 answer = self.gateway_client.send_command(command)
-> 1309 return_value = get_return_value(
1310 answer, self.gateway_client, self.target_id, self.name)
1311
~/miniconda3/envs/pyspark/lib/python3.9/site-packages/pyspark/sql/utils.py in deco(*a, **kw)
115 # Hide where the exception came from that shows a non-Pythonic
116 # JVM exception message.
--> 117 raise converted from None
118 else:
119 raise
AnalysisException: Resolved attribute(s) 0#991184L missing from __index_level_0__#991164L,a#991165L,b#991166L,__natural_order__#991170L in operator !Project [__index_level_0__#991164L, a#991165L, b#991166L, 0#991184L AS c#991191L, __natural_order__#991170L].;
!Project [__index_level_0__#991164L, a#991165L, b#991166L, 0#991184L AS c#991191L, __natural_order__#991170L]
+- Project [__index_level_0__#991164L, a#991165L, b#991166L, monotonically_increasing_id() AS __natural_order__#991170L]
   +- LogicalRDD [__index_level_0__#991164L, a#991165L, b#991166L], false
你能幫我理解我的方法出了什麼問題,以及如何為 Koalas 資料框分配一個新列嗎?
uj5u.com熱心網友回復:
不幸的是,您只能在assign方法中對資料框的現有列使用運算式。
解釋
錯誤堆疊中的重要部分是 spark 執行計劃:
AnalysisException: Resolved attribute(s) 0#991184L missing from __index_level_0__#991164L,a#991165L,b#991166L,__natural_order__#991170L in operator !Project [__index_level_0__#991164L, a#991165L, b#991166L, 0#991184L AS c#991191L, __natural_order__#991170L].;
!Project [__index_level_0__#991164L, a#991165L, b#991166L, 0#991184L AS c#991191L, __natural_order__#991170L]
+- Project [__index_level_0__#991164L, a#991165L, b#991166L, monotonically_increasing_id() AS __natural_order__#991170L]
   +- LogicalRDD [__index_level_0__#991164L, a#991165L, b#991166L], false
在 Spark 執行計劃中,Project 可以理解為 SQL 的 SELECT。您可以看到執行計劃在第二個 Project 處失敗(Spark 執行計劃要從下往上閱讀),因為它在 dft 資料框現有的列(即 __index_level_0__#991164L、a#991165L、b#991166L 和 __natural_order__#991170L)中找不到列 0#991184L(也就是您想添加到 dft 資料框中的那個 Series)。
事實上,列 0#991184L 來自您憑空創建的 Series,而不是由 dft 資料框衍生出來的 Series。對於 Spark 來說,這意味著該列屬於另一個資料框,因此顯然無法用 SELECT 從 dft 資料框中取得它,而這正是 Spark 試圖做的事情。
若要把 pandas 與 Spark 的 API 對應起來:assign 在 Spark 中的等效方法是 Spark DataFrame 的 withColumn 方法,其文檔說明:
列運算式必須是此 DataFrame 上的運算式;嘗試從其他某個 DataFrame 添加列將引發錯誤。
注意:實際上,與 assign 更為對應的 Spark 方法是 select,而不是只能添加一列的 withColumn;不過 withColumn 的上述限制同樣適用於 select。
因此assign適用于以下情況:
dft=koalas.DataFrame({'a':[1,2,3],'b':[3,4,5]})
# Type of dft['a'] and dft['b'] is Series
dft.assign(c=dft['a'])
dft.assign(d=dft['a']*2)
dft.assign(e=dft['a']*dft['b'])
但不適用于以下情況:
dft=koalas.DataFrame({'a':[1,2,3],'b':[3,4,5]})
dft.assign(c=koalas.Series([1,2,3]))
dft2=pd.DataFrame({'d': [1, 2, 3]})
# Type of dft2['d'] is Series
dft.assign(d=dft2['d'])
解決方法
這裡的解決方法是按照 ric-s 的回答中的說明進行操作,使用 dft['c'] = koalas.Series([1,2,3]) 來分配該列。
這種寫法之所以可行,是因為在這種情況下 Spark 會連接(join)兩個資料框,而不是僅僅從第一個資料框中選擇列。由於這個被 Koalas API 隱藏起來的連接在 Spark 中可能是非常昂貴的操作,Koalas 設定了一道護欄,您需要將 compute.ops_on_diff_frames 設定為 True 才能繞過它。
將 compute.ops_on_diff_frames 設定為 True 只是告訴 Koalas:「我知道這個操作是一個連接,可能會導致性能不佳」。執行完操作後,您可以使用 koalas.reset_option('compute.ops_on_diff_frames') 將此選項重置為其先前的值。
uj5u.com熱心網友回復:
老實說,我不知道為什麼使用 assign 會出現該錯誤,但向 koalas.DataFrame 添加新列的一種方法,是使用如下所示的標準 [''] 賦值方式。
更改選項compute.ops_on_diff_frames以允許對不同的系列/資料幀進行操作很重要。
import databricks.koalas as ks
ks.set_option('compute.ops_on_diff_frames', True)
dft = ks.DataFrame({'a':[1,2,3],'b':[3,4,5]})
dft['c'] = ks.Series([1,2,3])
dft
# a b c
# 0 1 3 1
# 1 2 4 2
# 2 3 5 3
轉載請註明出處,本文鏈接:https://www.uj5u.com/yidong/383930.html
上一篇:將一列考拉串列拆分為多列
