Is there a way to make a file with the correlation statistic between the raw number of fish observed ("num") and each environmental data column ("temp", "do", etc.) by species ("group")?
*As well as correlations between the means and medians of num vs. env. factors?
I'd also like to be able to choose which correlation method to use (Pearson correlation, Kendall rank correlation, Spearman correlation, etc.)
My data:
zeros <- structure(list(year = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L), .Label = c("2019", "2020"), class = "factor"), season = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("dry", "wet"), class = "factor"),
site = structure(c(1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L,
1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 1L, 1L, 2L, 2L, 3L,
3L, 4L, 4L, 5L, 5L, 1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L
), .Label = c("1", "2", "3", "4", "5"), class = "factor"),
group = structure(c(1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L
), .Label = c("Hardhead silverside", "Sailfin molly"), class = "factor"),
num = c(0, 8, 0, 9, 0, 13, 0, 9, 0, 10, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 7, 0, 2,
0, 3, 0, 13, 0), temp = c(23L, 36L, 35L, 34L, 30L, 28L, 18L,
19L, 33L, 33L, 25L, 20L, 33L, 23L, 36L, 32L, 28L, 17L, 34L,
31L, 26L, 34L, 26L, 35L, 15L, 25L, 26L, 20L, 18L, 14L, 23L,
17L, 26L, 17L, 17L, 19L, 29L, 31L, 18L, 15L), sal = c(12.5,
25.5, 8.5, 15.5, 17.5, 27.5, 9.5, 31.5, 1.5, 34.5, 25.5,
21.5, 10.5, 8.5, 32.5, 19.5, 6.5, 5.5, 15.5, 28.5, 6.5, 3.5,
29.5, 13.5, 7.5, 16.5, 3.5, 28.5, 22.5, 5.5, 9.5, 12.5, 29.5,
24.5, 8.5, 32.5, 37.5, 3.5, 12.5, 19.5), do = c(9.66, 7.66,
1.66, 14.66, 15.66, 1.66, 14.66, 15.66, 0.66, 5.66, 10.66,
11.66, 4.66, 0.66, 13.66, 1.66, 13.66, 6.66, 6.66, 10.66,
9.66, 15.66, 9.66, 15.66, 4.66, 13.66, 1.66, 11.66, 6.66,
8.66, 12.66, 0.66, 6.66, 0.66, 9.66, 16.66, 1.66, 10.66,
15.66, 10.66), depth = c(120L, 161L, 52L, 52L, 43L, 105L,
165L, 23L, 79L, 136L, 41L, 59L, 65L, 118L, 122L, 69L, 137L,
88L, 152L, 105L, 108L, 79L, 96L, 80L, 22L, 110L, 157L, 118L,
126L, 93L, 156L, 64L, 74L, 24L, 111L, 113L, 157L, 78L, 121L,
130L)), class = "data.frame", row.names = c(NA, -40L))
CodePudding user response:
The first part of your question is straightforward:
zeros.spl <- split(zeros, zeros$group)
zeros.cors <- sapply(zeros.spl, function(x) cor(x[, "num"], x[, 6:9]))
dimnames(zeros.cors)[[1]] <- colnames(zeros)[6:9]
zeros.cors
# Hardhead silverside Sailfin molly
# temp -0.3080334 0.36174046
# sal 0.1393580 0.47095129
# do 0.2544695 -0.06646818
# depth 0.1296208 0.08777425
t(zeros.cors)
# temp sal do depth
# Hardhead silverside -0.3080334 0.1393580 0.25446948 0.12962078
# Sailfin molly 0.3617405 0.4709513 -0.06646818 0.08777425
Use write.csv(zeros.cors, file="results.csv")
or write.csv(t(zeros.cors), file="results.csv")
depending on what you want the rows/cols to be.
The second question is not clear. The means/medians of a group will be a single value so you cannot correlate it with the environmental variables. You could compute the means by group with aggregate:
aggregate(zeros[, 5:9], by=list(zeros$group), "mean")
# Group.1 num temp sal do depth
# 1 Hardhead silverside 1.45 25.95 15.35 8.51 105.20
# 2 Sailfin molly 2.45 25.00 18.90 9.06 90.25
aggregate(zeros[, 5:9], by=list(zeros$group), "median")
# Group.1 num temp sal do depth
# 1 Hardhead silverside 0 26 11.5 9.66 115.5
# 2 Sailfin molly 0 24 19.5 10.66 90.5