I want to do the equivalent of the following in pandas, without using Spark.
This is what I do in Spark to generate some random data with the helper class UsedFunctions (the class itself is not the main point).
class UsedFunctions:
    """Helper routines that produce synthetic column values for test rows."""

    def randomString(self, length):
        """Return a string of ``length`` randomly chosen ASCII letters."""
        letters = string.ascii_letters
        return ''.join(random.choice(letters) for _ in range(length))

    def clustered(self, x, numRows):
        """Return ``floor(x - 1)`` divided by ``numRows``.

        NOTE(review): the floor is applied to ``x - 1`` *before* the
        division, so integer inputs yield fractions such as 0.1, 0.2, ...
        Confirm whether ``floor((x - 1) / numRows)`` was intended.
        """
        return math.floor(x - 1) / numRows

    def scattered(self, x, numRows):
        """Return ``abs(x - (1 % numRows))`` as a float.

        NOTE(review): ``%`` binds tighter than ``-``, so the modulo only
        applies to the literal ``1``. Confirm whether
        ``(x - 1) % numRows`` was intended.
        """
        return abs(x - 1 % numRows) * 1.0

    def randomised(self, seed, numRows):
        """Return a float in ``[0, numRows)`` that is reproducible per seed.

        Side effect: reseeds the module-level ``random`` generator.
        """
        random.seed(seed)
        return abs(random.randint(0, numRows) % numRows) * 1.0

    def padString(self, x, chars, length):
        """Left-pad ``str(x)`` to ``length`` characters.

        Each padding character is drawn at random from ``chars``. If
        ``str(x)`` is already ``length`` characters or longer, it is
        returned without padding.
        """
        text = str(x)
        # Count the characters directly instead of int(log10(x) + 1):
        # same result for positive integers, and no ValueError for x == 0.
        pad_len = length - len(text)
        return ''.join(random.choice(chars) for _ in range(pad_len)) + text

    def padSingleChar(self, chars, length):
        """Return ``chars`` repeated ``length`` times."""
        return chars * length

    def println(self, lst):
        """Print the first element of every item in ``lst``, one per line."""
        for item in lst:
            print(item[0])
# Driver script: build a Spark DataFrame of synthetic rows, one per ID.
usedFunctions = UsedFunctions()
spark = (
    SparkSession.builder
    .enableHiveSupport()
    .getOrCreate()
)
sc = SparkContext.getOrCreate()
numRows = 10
start = 1
end = start + 9
print("starting at ID = ", start, ",ending on = ", end)
# range() excludes `end`, so IDs start .. end - 1 are generated.
Range = range(start, end)
# Each ID becomes one 7-field tuple built by the UsedFunctions helpers.
rdd = sc.parallelize(Range).map(
    lambda x: (
        x,
        usedFunctions.clustered(x, numRows),
        usedFunctions.scattered(x, numRows),
        usedFunctions.randomised(x, numRows),
        usedFunctions.randomString(50),
        usedFunctions.padString(x, " ", 50),
        usedFunctions.padSingleChar("x", 4000),
    )
)
df = rdd.toDF()
How can I create the pandas DataFrame df without using Spark? I know the following Spark-DataFrame-to-pandas conversion works, but using Spark is not an option here.
# Converts the Spark DataFrame `df` built above into a pandas DataFrame; this path needs Spark.
p_dfm = df.toPandas() # converting Spark DF to pandas DF
Thanks
CodePudding user response:
I tried to retain most of your code and syntax from spark.
# your class and functions on top as is ...
# Driver script: build the same synthetic rows directly as a pandas DataFrame.
usedFunctions = UsedFunctions()
numRows = 10
start = 1
end = start + 9
print("starting at ID = ", start, ",ending on = ", end)
# range() excludes `end`, so IDs start .. end - 1 are generated.
Range = range(start, end)
# One 7-field tuple per ID; pandas assigns default integer column labels.
df = pd.DataFrame(
    [
        (
            x,
            usedFunctions.clustered(x, numRows),
            usedFunctions.scattered(x, numRows),
            usedFunctions.randomised(x, numRows),
            usedFunctions.randomString(50),
            usedFunctions.padString(x, " ", 50),
            usedFunctions.padSingleChar("x", 4000),
        )
        for x in Range
    ]
)
Output:
0 1 2 3 4 5 6
0 1 0.0 0.0 2.0 KZWeqhFWCEPyYngFbyBM... ... xxxxxxxxxxxxxxx...
1 2 0.1 1.0 0.0 ffxkVZQtqMnMcLRkBOzZ... ... xxxxxxxxxxxxxxx...
2 3 0.2 2.0 3.0 LIixMEOLeMaEqJomTEIJ... ... xxxxxxxxxxxxxxx...
3 4 0.3 3.0 3.0 tgUzEjfebzJsZWdoHIxr... ... xxxxxxxxxxxxxxx...
4 5 0.4 4.0 9.0 qVwYSVPHbDXpPdkhxEpy... ... xxxxxxxxxxxxxxx...
5 6 0.5 5.0 9.0 fFWqcajQLEWVxuXbrFZm... ... xxxxxxxxxxxxxxx...
6 7 0.6 6.0 5.0 jzPdeIgxLdGncfBAepfJ... ... xxxxxxxxxxxxxxx...
7 8 0.7 7.0 3.0 xyimTcfipZGnzPbDFDyF... ... xxxxxxxxxxxxxxx...
8 9 0.8 8.0 7.0 NxrilRavGDMfvJNScUyk... ... xxxxxxxxxxxxxxx...