This function returns array of int:
from pyspark.sql import functions as F
import pandas as pd
@F.pandas_udf('array<int>')
def pudf(x: pd.Series, y: pd.Series) -> pd.Series:
return pd.Series([[x, y]])
df = spark.createDataFrame([(5, 2), (6, 7)])
df = df.withColumn('out', pudf('_1', '_2'))
df.show()
# --- --- ------
# | _1| _2| out|
# --- --- ------
# | 5| 2|[5, 2]|
# | 6| 7|[6, 7]|
# --- --- ------
df.printSchema()
# root
# |-- _1: long (nullable = true)
# |-- _2: long (nullable = true)
# |-- out: array (nullable = true)
# | |-- element: integer (containsNull = true)
Question. How to return array or string?
If I change int
to string
and df elements to string, it fails to return the expected array of strings.
from pyspark.sql import functions as F
import pandas as pd
@F.pandas_udf('array<string>')
def pudf(x: pd.Series, y: pd.Series) -> pd.Series:
return pd.Series([[x, y]])
df = spark.createDataFrame([('5', '2'), ('6', '7')])
df = df.withColumn('out', pudf('_1', '_2'))
df.show()
PythonException: An exception was thrown from the Python worker. Please see the stack trace below. Traceback (most recent call last): File "pyarrow/array.pxi", line 913, in pyarrow.lib.Array.from_pandas File "pyarrow/array.pxi", line 311, in pyarrow.lib.array File "pyarrow/array.pxi", line 83, in pyarrow.lib._ndarray_to_array File "pyarrow/error.pxi", line 122, in pyarrow.lib.check_status pyarrow.lib.ArrowTypeError: Expected bytes, got a 'Series' object
CodePudding user response:
from pyspark.sql import functions as F
import pandas as pd
@F.pandas_udf('array<string>')
def pudf(x: pd.Series, y: pd.Series) -> pd.Series:
return pd.Series([[x[0],y[0]]])
df = spark.createDataFrame([('5', '2'), ('6', '7')])
df = df.withColumn('out', pudf('_1', '_2'))
df.show(truncate=False)
df.printSchema()
# --- --- ------
# |_1 |_2 |out |
# --- --- ------
# |5 |2 |[5, 2]|
# |6 |7 |[6, 7]|
# --- --- ------
# root
# |-- _1: string (nullable = true)
# |-- _2: string (nullable = true)
# |-- out: array (nullable = true)
# | |-- element: string (containsNull = true)