Can you please tell me how to convert the url column into the output column in a DataFrame using PySpark?
Transformation rules:
- replace https with abfss
- replace "blob.core.windows.net" with "dfs.core.windows.net"
- build the new path as: the text between the 3rd '/' and the last '/' (container plus any subfolders), then '@', then the host (the text between the 2nd and 3rd '/'), then the remaining file name.
from pyspark.sql import Row
lst = [Row(url='https://inputfile.blob.core.windows.net/inputstorage/AvailabilityZones_1.csv',
           output='abfss://[email protected]/AvailabilityZones_1.csv'),
       Row(url='https://inputfile.blob.core.windows.net/inputstorage/AvailabilityZones_2.csv',
           output='abfss://[email protected]/AvailabilityZones_2.csv'),
       Row(url='https://inputfile.blob.core.windows.net/inputstorage/newfolder/AvailabilityZones_3.csv',
           output='abfss://inputstorage/[email protected]/AvailabilityZones_3.csv')]
df = spark.createDataFrame(lst)
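For reference, the sample rows (including the desired output column) can be printed directly; a quick check using the df created above:

df.show(truncate=False)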
The expected DataFrame pairs each url with the desired output value, as in the rows above.
CodePudding user response:
Since no one answered, I am answering it myself.
from pyspark.sql.functions import col, udf
def parseurl(url):
    # swap the scheme and the storage endpoint
    url = url.replace('https://', 'abfss://')
    url = url.replace('blob.core.windows.net', 'dfs.core.windows.net')
    arr = url.split('/')
    # arr[2] is the account host, arr[3] the container, arr[-1] the file name;
    # anything in between is an optional subfolder path
    sub_arr = []
    for pos in range(4, len(arr) - 1):
        sub_arr.append(arr[pos])
    subFolder = "/".join(str(x) for x in sub_arr)
    if subFolder != "":
        # abfss://container/subfolder@account/file
        fin_url = url[:8] + arr[3] + '/' + subFolder + '@' + arr[2] + '/' + arr[-1]
    else:
        # abfss://container@account/file
        fin_url = url[:8] + arr[3] + '@' + arr[2] + '/' + arr[-1]
    return fin_url

urlUDF = udf(lambda z: parseurl(z))  # default return type is StringType

df.select(col("url"),
          urlUDF(col("url")).alias("fin_url")) \
  .show(truncate=False)
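As a side note, the same transformation can likely be done with built-in column functions instead of a Python UDF, which avoids the Python serialization overhead. This is only a sketch and not part of the original answer; the regular expressions assume the https://<account-host>/<container>[/<subfolders>]/<file> layout of the sample urls above:

from pyspark.sql.functions import col, concat, lit, regexp_extract, regexp_replace, substring_index

# account host between the 2nd and 3rd '/', with the blob endpoint swapped for dfs
account = regexp_replace(regexp_extract(col("url"), r"https://([^/]+)/", 1),
                         r"blob\.core\.windows\.net", "dfs.core.windows.net")
# container plus optional subfolders: everything between the 3rd '/' and the last '/'
path = regexp_extract(col("url"), r"https://[^/]+/(.*)/[^/]*$", 1)
# file name after the last '/'
file_name = substring_index(col("url"), "/", -1)

df.withColumn("fin_url",
              concat(lit("abfss://"), path, lit("@"), account, lit("/"), file_name)) \
  .show(truncate=False)

On the three sample rows this should produce the same values as the output column in lst.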