有可能行绑定两个没有相同列集的数据帧吗?我希望保留绑定后不匹配的列。
当前回答
data.table的替代方案:
library(data.table)
df1 = data.frame(a = c(1:5), b = c(6:10))
df2 = data.frame(a = c(11:15), b = c(16:20), c = LETTERS[1:5])
rbindlist(list(df1, df2), fill = TRUE)
Rbind也可以在数据中工作。表中只要对象都转换为数据。表对象,所以
rbind(setDT(df1), setDT(df2), fill=TRUE)
也适用于这种情况。当您有几个数据时,这可能更可取。表,而不想构造列表。
其他回答
rbind。从包装胶合板填充可能是你正在寻找的。
rbind.ordered=function(x,y){
diffCol = setdiff(colnames(x),colnames(y))
if (length(diffCol)>0){
cols=colnames(y)
for (i in 1:length(diffCol)) y=cbind(y,NA)
colnames(y)=c(cols,diffCol)
}
diffCol = setdiff(colnames(y),colnames(x))
if (length(diffCol)>0){
cols=colnames(x)
for (i in 1:length(diffCol)) x=cbind(x,NA)
colnames(x)=c(cols,diffCol)
}
return(rbind(x, y[, colnames(x)]))
}
您也可以使用sjmisc::add_rows(),它使用dplyr::bind_rows(),但与bind_rows()不同,add_rows()保留属性,因此对带标签的数据很有用。
请参阅以下带有标记数据集的示例。如果数据被标记,frq()函数打印带有值标签的频率表。
library(sjmisc)
library(dplyr)
data(efc)
# select two subsets, with some identical and else different columns
x1 <- efc %>% select(1:5) %>% slice(1:10)
x2 <- efc %>% select(3:7) %>% slice(11:20)
str(x1)
#> 'data.frame': 10 obs. of 5 variables:
#> $ c12hour : num 16 148 70 168 168 16 161 110 28 40
#> ..- attr(*, "label")= chr "average number of hours of care per week"
#> $ e15relat: num 2 2 1 1 2 2 1 4 2 2
#> ..- attr(*, "label")= chr "relationship to elder"
#> ..- attr(*, "labels")= Named num 1 2 3 4 5 6 7 8
#> .. ..- attr(*, "names")= chr "spouse/partner" "child" "sibling" "daughter or son -in-law" ...
#> $ e16sex : num 2 2 2 2 2 2 1 2 2 2
#> ..- attr(*, "label")= chr "elder's gender"
#> ..- attr(*, "labels")= Named num 1 2
#> .. ..- attr(*, "names")= chr "male" "female"
#> $ e17age : num 83 88 82 67 84 85 74 87 79 83
#> ..- attr(*, "label")= chr "elder' age"
#> $ e42dep : num 3 3 3 4 4 4 4 4 4 4
#> ..- attr(*, "label")= chr "elder's dependency"
#> ..- attr(*, "labels")= Named num 1 2 3 4
#> .. ..- attr(*, "names")= chr "independent" "slightly dependent" "moderately dependent" "severely dependent"
bind_rows(x1, x1) %>% frq(e42dep)
#>
#> # e42dep <numeric>
#> # total N=20 valid N=20 mean=3.70 sd=0.47
#>
#> val frq raw.prc valid.prc cum.prc
#> 3 6 30 30 30
#> 4 14 70 70 100
#> <NA> 0 0 NA NA
add_rows(x1, x1) %>% frq(e42dep)
#>
#> # elder's dependency (e42dep) <numeric>
#> # total N=20 valid N=20 mean=3.70 sd=0.47
#>
#> val label frq raw.prc valid.prc cum.prc
#> 1 independent 0 0 0 0
#> 2 slightly dependent 0 0 0 0
#> 3 moderately dependent 6 30 30 30
#> 4 severely dependent 14 70 70 100
#> NA NA 0 0 NA NA
我写了一个函数来做这件事,因为我喜欢我的代码告诉我什么是错误的。这个函数将显式地告诉您哪些列名不匹配,以及是否存在类型不匹配。然后它会尽最大努力组合data.frames。限制是一次只能合并两个data.frame。
### combines data frames (like rbind) but by matching column names
# columns without matches in the other data frame are still combined
# but with NA in the rows corresponding to the data frame without
# the variable
# A warning is issued if there is a type mismatch between columns of
# the same name and an attempt is made to combine the columns
combineByName <- function(A,B) {
a.names <- names(A)
b.names <- names(B)
all.names <- union(a.names,b.names)
print(paste("Number of columns:",length(all.names)))
a.type <- NULL
for (i in 1:ncol(A)) {
a.type[i] <- typeof(A[,i])
}
b.type <- NULL
for (i in 1:ncol(B)) {
b.type[i] <- typeof(B[,i])
}
a_b.names <- names(A)[!names(A)%in%names(B)]
b_a.names <- names(B)[!names(B)%in%names(A)]
if (length(a_b.names)>0 | length(b_a.names)>0){
print("Columns in data frame A but not in data frame B:")
print(a_b.names)
print("Columns in data frame B but not in data frame A:")
print(b_a.names)
} else if(a.names==b.names & a.type==b.type){
C <- rbind(A,B)
return(C)
}
C <- list()
for(i in 1:length(all.names)) {
l.a <- all.names[i]%in%a.names
pos.a <- match(all.names[i],a.names)
typ.a <- a.type[pos.a]
l.b <- all.names[i]%in%b.names
pos.b <- match(all.names[i],b.names)
typ.b <- b.type[pos.b]
if(l.a & l.b) {
if(typ.a==typ.b) {
vec <- c(A[,pos.a],B[,pos.b])
} else {
warning(c("Type mismatch in variable named: ",all.names[i],"\n"))
vec <- try(c(A[,pos.a],B[,pos.b]))
}
} else if (l.a) {
vec <- c(A[,pos.a],rep(NA,nrow(B)))
} else {
vec <- c(rep(NA,nrow(A)),B[,pos.b])
}
C[[i]] <- vec
}
names(C) <- all.names
C <- as.data.frame(C)
return(C)
}
您可以将它们插入到原始数据库(db1)的末尾,并添加第二个数据库的行数。db2中不包括的列将显示NA值。
db1[nrow(db1)+1:nrow(db1)+nrow(db2), names(db2)] <- db2