R Merge Rows in a Data Frame

Question

R Merge Rows in a Data Frame

Here is the head of a big data frame:

head(Hdata_soil)
                      X_id           timestamp address rssi batt_v soil_temp_1 soil_temp_2 soil_temp_3 soil_moisture_1
1 565846060dd8e408e3817c58 2015-11-27 12:01:10      A8  -65     NA          NA          NA          NA              NA
2 565846070dd8e408e3817c59 2015-11-27 12:01:11      A8   NA     NA        9.73     -273.15       14.63             647
3 565846cf0dd8e408e3817caf 2015-11-27 12:04:31      A7  -64     NA          NA          NA          NA              NA
4 565846cf0dd8e408e3817cb0 2015-11-27 12:04:31      A7   NA     NA        8.56        9.46        9.64             660
5 565847650dd8e408e3817cf5 2015-11-27 12:07:01      A8  -64     NA          NA          NA          NA              NA
6 565847660dd8e408e3817cf6 2015-11-27 12:07:02      A8   NA     NA        9.82     -273.15       14.29             643

Access the complete dataset from dropbox

As you can see, for each address there are 2 consecutive observations with timestamps approximately 1 s apart. The variables are split between these two observations. How can I merge them into one row while keeping the first timestamp?

It would also be great to make sure that this only happens with two consecutive observations from the same address.

I would really appreciate it if someone could point me in the right direction regarding the packages / features to use.

+4

r dataframe

Rory shaw Jan 7 '16 at 12:33

source share

4 answers

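One option is to split the data by address, collapse each group to a single row (first id, earliest timestamp, the non-missing rssi, NA if nothing was observed, etc.) and then bind the rows back together with rbind.data.frame.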

# Collapse all rows that share an address into a single row.
#
# Input : Hdata_soil, a data frame with at least the columns X_id, timestamp
#         and address; every other column is treated as a measurement.
# Output: result, a data frame with one row per distinct address.
#
# NOTE: this groups by address over the whole data set, not only over
# consecutive rows.

# Reduce one measurement column to a single value (always length 1):
#   - NA when nothing was observed,
#   - the value itself when it was observed once,
#   - the mean when a numeric variable was observed several times
#     (first value for non-numeric variables) - adapt as needed.
collapse_values <- function(x) {
  x <- x[!is.na(x)]
  if (length(x) == 0L) {
    NA
  } else if (length(x) >= 2L && is.numeric(x)) {
    mean(x)
  } else {
    x[1L]
  }
}

unad <- unique(Hdata_soil$address)

# Columns that get dedicated handling below; everything else is collapsed
# with collapse_values().
key_cols <- c("X_id", "timestamp", "address")

lst <- lapply(unad, function(ad) {
  # %in% (rather than ==) so that a missing address does not yield NA rows
  recs <- Hdata_soil[Hdata_soil$address %in% ad, , drop = FALSE]
  measured <- lapply(recs[setdiff(names(recs), key_cols)], collapse_values)
  data.frame(
    X_id = recs$X_id[1L],            # keep the id of the first record
    timestamp = min(recs$timestamp), # keep the earliest timestamp
    address = ad,
    measured,                        # named list -> one column per variable
    stringsAsFactors = FALSE
  )
})

result <- do.call(rbind.data.frame, lst)

Alternatively, to merge only consecutive rows that share the same address:

# Merge runs of consecutive rows that share the same address.
#
# Input : Hdata_soil, a data frame with an 'address' column.
# Output: mtx, a two-column matrix (startR, endR) with one row per run of
#         consecutive identical addresses, and result, a data frame with one
#         merged row per run.

# identifies consecutive duplicate addresses and groups together into subsets;
# rle() handles runs of any length (missing addresses are never grouped)
runs <- rle(as.character(Hdata_soil$address))
end_r <- cumsum(runs$lengths)
start_r <- end_r - runs$lengths + 1L

# cbind() always yields a matrix, even for a single run
mtx <- cbind(startR = start_r, endR = end_r)

# First non-missing value of a vector (its first element if all are missing).
first_non_na <- function(x) {
  keep <- which(!is.na(x))
  if (length(keep) > 0L) x[keep[1L]] else x[1L]
}

lst <- lapply(seq_len(nrow(mtx)), function(r) {
  datsubset <- Hdata_soil[mtx[r, "startR"]:mtx[r, "endR"], , drop = FALSE]
  # aggregate the subset of rows into one row as you please; the default
  # below keeps, per column, the first non-missing value, so the id and
  # timestamp of the first row of the run are retained
  merged <- datsubset[1L, , drop = FALSE]
  merged[] <- lapply(datsubset, first_non_na)
  merged
})

result <- do.call(rbind.data.frame, lst)

. , - , .

0

Bazz 07 . '16 14:43

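If the address alone does not identify the rows that belong together, you can add an id column that marks each pair and aggregate by it. For the example data the id is: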

# setDT() and the DT[i, j, by] syntax used below come from data.table
library(data.table)

# Pair id for the six example rows: rows 1-2, 3-4 and 5-6 belong together
df$id <- as.character(c(1,1,2,2,3,3))

# Replace NA by 0 so that summing a pair returns the one observed value
df[is.na(df)] <- 0

# Extract numeric columns (vapply always returns a logical vector, even for a
# data frame without columns; id is character and therefore excluded)
tokeep <- which(vapply(df, is.numeric, logical(1)))

# Sum numeric columns per id
setDT(df)[, lapply(.SD, sum), by = id, .SDcols = tokeep]

:

   id rssi soil_temp_1 soil_temp_2 soil_temp_3 soil_moisture_1
1:  1  -65        9.73     -273.15       14.63             647
2:  2  -64        8.56        9.46        9.64             660
3:  3  -64        9.82     -273.15       14.29             643

You can then combine this with the remaining non-numeric columns of your original df by choosing which unique values you want to keep for X_id and timestamp.

0

mtoto Jan 7 '16 at 19:00

source share

this solution using dplyr may work if you are sure that rounding the “timestamp” to the nearest minute will give a unique identifier in combination with the “address”:

library(readr) # Required only for recreating your data frame
library(dplyr)

# Recreate the six example rows from inline CSV text
Hdata_soil <- readr::read_csv("X_id,timestamp,address,rssi,batt_v,soil_temp_1,soil_temp_2,soil_temp_3,soil_moisture_1
565846060dd8e408e3817c58,27/11/2015 12:01:10,A8,-65,NA,NA,NA,NA,NA
565846070dd8e408e3817c59,27/11/2015 12:01:11,A8,NA,NA,9.73,-273.15,14.63,647
565846cf0dd8e408e3817caf,27/11/2015 12:04:31,A7,-64,NA,NA,NA,NA,NA
565846cf0dd8e408e3817cb0,27/11/2015 12:04:31,A7,NA,NA,8.56,9.46,9.64,660
565847650dd8e408e3817cf5,27/11/2015 12:07:01,A8,-64,NA,NA,NA,NA,NA
565847660dd8e408e3817cf6,27/11/2015 12:07:02,A8,NA,NA,9.82,-273.15,14.29,643")

# Parse the timestamps, bucket them by minute and collapse each bucket
Hdata_soil <- Hdata_soil %>%
  dplyr::mutate(
    # Text in day/month/year order -> POSIXct
    timestamp = as.POSIXct(strptime(timestamp, format = "%d/%m/%Y %H:%M:%S")),
    # Nearest whole minute; used only as a grouping key
    timestamp_round = as.POSIXct(round(timestamp, units = "mins"))
  ) %>%
  # One group per (rounded minute, address) combination
  dplyr::group_by(timestamp_round, address) %>%
  # Within each group keep the smallest non-NA value of every other column
  dplyr::summarise_each(funs(min(., na.rm = TRUE)))

What gives:

> # Print
> Hdata_soil
Source: local data frame [3 x 10]
Groups: timestamp_round [?]

      timestamp_round address                     X_id           timestamp  rssi batt_v soil_temp_1 soil_temp_2 soil_temp_3 soil_moisture_1
               (time)   (chr)                    (chr)              (time) (int)  (lgl)       (dbl)       (dbl)       (dbl)           (int)
1 2015-11-27 12:01:00      A8 565846060dd8e408e3817c58 2015-11-27 12:01:10   -65     NA        9.73     -273.15       14.63             647
2 2015-11-27 12:05:00      A7 565846cf0dd8e408e3817caf 2015-11-27 12:04:31   -64     NA        8.56        9.46        9.64             660
3 2015-11-27 12:07:00      A8 565847650dd8e408e3817cf5 2015-11-27 12:07:01   -64     NA        9.82     -273.15       14.29             643

0

user5754437 Jan 7 '16 at 23:14

source share

fdetsch · Accepted Answer · 2016-01-07T14:01:30+0000

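Here is one way to do it. First, the timestamps are converted to "POSIXlt" so that time differences can be computed. foreach then loops over all rows, skipping those that have already been merged into an earlier row (tracked in "used"). Later rows recorded shortly after the current one are located with which and difftime (here, within 5 seconds). If such rows exist (and have the same "address" as the current row), their values fill the gaps in the current row; otherwise the current row is returned unchanged.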

## load 'foreach' package
library(foreach)

## import and reformat data
## Merge every observation with the later observations of the same address
## that were recorded less than 5 s after it.
##
## Input : "Hdata_soil.csv" with (at least) the columns 'timestamp'
##         (format "%Y-%m-%d %H:%M:%S") and 'address'.
## Output: dat_out, a data frame with one row per merged group; each row keeps
##         the values of its first observation, with missing values filled in
##         from the later observations of the group.

## import and reformat data
Hdata_soil <- read.csv("Hdata_soil.csv", header = TRUE,
                       stringsAsFactors = FALSE)

## reformat timestamps
timestamps <- strptime(Hdata_soil$timestamp, format = "%Y-%m-%d %H:%M:%S")
n_obs <- length(timestamps)

## vector with information about merged lines (%do% evaluates its body in the
## calling environment, so updates to 'used' persist across iterations)
used <- integer()
dat_out <- foreach(i = seq_len(n_obs), .combine = "rbind") %do% {

  if (i %in% used) {
    ## line has already been merged into an earlier line: contribute nothing
    NULL
  } else {
    used <- c(used, i)
    Hdata_soil_x <- Hdata_soil[i, , drop = FALSE]

    ## later lines that have not been merged yet (none for the last line)
    later <- if (i < n_obs) (i + 1L):n_obs else integer()
    later <- later[!(later %in% used)]

    partners <- integer()
    if (length(later) > 0) {
      x <- timestamps[i]
      y <- timestamps[later]

      # (subset same or consecutive days to reduce
      # computation time of 'difftime')
      id_day <- which(as.Date(y) == as.Date(x) |
                        as.Date(y) == (as.Date(x) + 1))

      # (subset records within 5s from current observation)
      id_sec <- which(difftime(y[id_day], x, units = "secs") < 5)
      partners <- later[id_day[id_sec]]

      # (keep only records of the same address; which() drops NA addresses)
      same_address <- which(Hdata_soil[partners, "address"] ==
                              Hdata_soil[i, "address"])
      partners <- partners[same_address]
    }

    ## fill missing values of the current line from each partner in turn;
    ## the merged line accumulates across partners instead of being reset
    for (p in partners) {
      Hdata_soil_y <- Hdata_soil[p, , drop = FALSE]

      # columns that are missing in the current line but present in partner
      fill <- which(is.na(Hdata_soil_x) & !is.na(Hdata_soil_y))
      if (length(fill) > 0) {
        Hdata_soil_x[fill] <- Hdata_soil_y[fill]
      }

      # update information about merged lines
      used <- c(used, p)
    }

    ## merged line (identical to the current line if there were no partners)
    Hdata_soil_x
  }
}

Note that this takes quite a while on the complete dataset, mostly because of the repeated calls to difftime:

 >     user   system  elapsed 
 > 2209.504   99.389 2311.996

R Merge Rows in a Data Frame

More articles: