classA = Dataset(id = ["id1", "id2", "id3", "id4", "id5"],
mark = [50, 69.5, 45.5, 88.0, 98.5]);
grades = Dataset(mark = [0, 49.5, 59.5, 69.5, 79.5, 89.5, 95.5],
grade = ["F", "P", "C", "B", "A-", "A", "A "]);
we can use InMemorydatasets package to do closejoin.
How can we do this method in DataFrames package.
closejoin(classA, grades, on = :mark)
closejoin(classA, grades, on = :mark, direction=:forward, border=:nearest)
and how to do this in R?
CodePudding user response:
In R it could be done by using findInterval
.
classA = data.frame(id = c("id1", "id2", "id3", "id4", "id5"),
mark = c(50, 69.5, 45.5, 88.0, 98.5))
grades = data.frame(mark = c(0, 49.5, 59.5, 69.5, 79.5, 89.5, 95.5),
grade = c("F", "P", "C", "B", "A-", "A", "A "))
cbind(classA, grade = grades$grade[findInterval(classA$mark, grades$mark)])
# id mark grade
#1 id1 50.0 P
#2 id2 69.5 B
#3 id3 45.5 F
#4 id4 88.0 A-
#5 id5 98.5 A
cbind(classA, grade = grades$grade[findInterval(classA$mark, c(-Inf, grades$mark), all.inside = TRUE, left.open = TRUE)])
id mark grade
#1 id1 50.0 C
#2 id2 69.5 B
#3 id3 45.5 P
#4 id4 88.0 A
#5 id5 98.5 A
In Julia you can use searchsortedlast
and searchsortedfirst
.
using DataFrames
classA = DataFrame(id = ["id1", "id2", "id3", "id4", "id5"],
mark = [50, 69.5, 45.5, 88.0, 98.5]);
grades = DataFrame(mark = [0, 49.5, 59.5, 69.5, 79.5, 89.5, 95.5],
grade = ["F", "P", "C", "B", "A-", "A", "A "]);
classA[!, "Grade"] = grades.grade[[searchsortedlast(grades.mark, x) for x in classA.mark]]
classA
#5×3 DataFrame
# Row │ id mark Grade
# │ String Float64 String
#─────┼─────────────────────────
# 1 │ id1 50.0 P
# 2 │ id2 69.5 B
# 3 │ id3 45.5 F
# 4 │ id4 88.0 A-
# 5 │ id5 98.5 A
classA[!, "Grade"] = grades.grade[min.(length(grades.grade), [searchsortedfirst(grades.mark, x) for x in classA.mark])]
classA
#5×3 DataFrame
# Row │ id mark Grade
# │ String Float64 String
#─────┼─────────────────────────
# 1 │ id1 50.0 C
# 2 │ id2 69.5 B
# 3 │ id3 45.5 P
# 4 │ id4 88.0 A
# 5 │ id5 98.5 A
The same with InMemoryDatasets
in Julia as given in the question including results for comparison.
using InMemoryDatasets
classA = Dataset(id = ["id1", "id2", "id3", "id4", "id5"],
mark = [50, 69.5, 45.5, 88.0, 98.5]);
grades = Dataset(mark = [0, 49.5, 59.5, 69.5, 79.5, 89.5, 95.5],
grade = ["F", "P", "C", "B", "A-", "A", "A "]);
closejoin(classA, grades, on = :mark)
#5×3 Dataset
# Row │ id mark grade
# │ identity identity identity
# │ String? Float64? String?
#─────┼──────────────────────────────
# 1 │ id1 50.0 P
# 2 │ id2 69.5 B
# 3 │ id3 45.5 F
# 4 │ id4 88.0 A-
# 5 │ id5 98.5 A
closejoin(classA, grades, on = :mark, direction=:forward, border=:nearest)
#5×3 Dataset
# Row │ id mark grade
# │ identity identity identity
# │ String? Float64? String?
#─────┼──────────────────────────────
# 1 │ id1 50.0 C
# 2 │ id2 69.5 B
# 3 │ id3 45.5 P
# 4 │ id4 88.0 A
# 5 │ id5 98.5 A
CodePudding user response:
Don't think you can do this with the DataFrames package. One potential solution is to use RCall and data.table, e.g. in R:
library(data.table)
classA <- structure(list(id = c("id1", "id2", "id3", "id4", "id5"),
mark = c(50, 69.5, 45.5, 88, 98.5)),
class = c("data.table", "data.frame"), row.names = c(NA, -5L))
classB <- structure(list(mark = c(0, 49.5, 59.5, 69.5, 79.5, 89.5, 95.5),
grade = c("F", "P", "C", "B", "A-", "A", "A ")),
class = c("data.table", "data.frame"), row.names = c(NA, -7L))
setkey(classA, mark)
setkey(classB, mark)
classB[classA, roll= Inf][order(id)]
#> mark grade id
#> 1: 50.0 P id1
#> 2: 69.5 B id2
#> 3: 45.5 F id3
#> 4: 88.0 A- id4
#> 5: 98.5 A id5
Created on 2022-09-19 by the reprex package (v2.0.1)
CodePudding user response:
As InMemoryDatasets
is a Julia package, and composability is a Julia highlight, getting DataFrames
to do closejoin
can be done with some connecting code. Admittedly this is not super efficient, but InMemoryDatasets
code works, and reusing it saves some development time.
Specifically, the following:
using DataFrames, InMemoryDatasets
import Base: pairs
pairs(d::DataFrame) = zip(Symbol.(names(d)), eachcol(d))
pairs(d::Dataset) = zip(Symbol.(names(d)), eachcol(d))
function df_closejoin(f1::DataFrame, f2::DataFrame; kwargs...)
f1ds = Dataset(Dict(pairs(f1)))
f2ds = Dataset(Dict(pairs(f2)))
resds = InMemoryDatasets.closejoin(f1ds, f2ds; kwargs...)
return DataFrame(Dict(pairs(resds)))
end
Allows:
julia> df_closejoin(classA, grades, on = :mark)
5×3 DataFrame
Row │ grade id mark
│ String? String? Float64?
─────┼────────────────────────────
1 │ P id1 50.0
2 │ B id2 69.5
3 │ F id3 45.5
4 │ A- id4 88.0
5 │ A id5 98.5
julia> df_closejoin(classA, grades, on = :mark,
direction=:forward, border=:missing)
5×3 DataFrame
Row │ grade id mark
│ String? String? Float64?
─────┼────────────────────────────
1 │ C id1 50.0
2 │ B id2 69.5
3 │ P id3 45.5
4 │ A id4 88.0
5 │ missing id5 98.5
When both classA
and grades
are now DataFrames defined by:
classA = DataFrame(id = ["id1", "id2", "id3", "id4", "id5"],
mark = [50, 69.5, 45.5, 88.0, 98.5]);
grades = DataFrame(mark = [0, 49.5, 59.5, 69.5, 79.5, 89.5, 95.5],
grade = ["F", "P", "C", "B", "A-", "A", "A "]);