Reading a text file with different column widths but fixed delimiter in R

I have several .txt files that look like this:

header
header
header
header
header
01130009.JPG   JPEG         2/5/2018 3:53:44 PM   G:\AAA AAAAAAAA\AAAAA\BBBB BBBB & BBBBB BBBBB\CAM_07-0008\Farther Downg   Gray Fox                                                                           
01130009.JPG   JPEG         2/5/2018 3:53:44 PM   G:\AAA AAAAAAAA\AAAAA\BBBB BBBB & BBBBB BBBBB\CAM_07-0008\Farther Downg   Direct Register Walk, Gait, Gray Fox, Stop                                         
01130009.JPG   JPEG         2/5/2018 3:53:44 PM   G:\AAA AAAAAAAA\AAAAA\BBBB BBBB & BBBBB BBBBB\CAM_07-0008\Farther Downg   Gray Fox   

The width of the last two columns varies, but there are always 3 spaces between all columns (in this case, the third column is empty).

I use this code to read in the .txt example:

# Fixed-width read: skip the 5 header lines and cut every row at the given widths.
read.fwf(filename.txt, skip = 5, widths = c(12, 16, 19, 76, 83), fill = TRUE, fileEncoding = "UTF-16")

But this code will not work properly in this .txt:

header
header
header
header
header
01130009.JPG   JPEG         2/5/2018 3:53:44 PM   G:\AAA AAAAAAAA\AAAAA AA\BBBB BBBB & BBBBB BBBBB\CAM_07-0008\Farther DowngBBB   Gray Fox                                                                           
01130009.JPG   JPEG         2/5/2018 3:53:44 PM   G:\AAA AAAAAAAA\AAAAA AA\BBBB BBBB & BBBBB BBBBB\CAM_07-0008\Farther DowngBBB   Direct Register Walk, Gait, Gray Fox, Stop                                         
01130009.JPG   JPEG         2/5/2018 3:53:44 PM   G:\AAA AAAAAAAA\AAAAA AA\BBBB BBBB & BBBBB BBBBB\CAM_07-0008\Farther DowngBBB   Gray Fox   

Is there a way to read in a TXT file using a fixed delimiter (3 spaces) instead of specifying the width of each column, since the column widths differ from file to file?

Files also have some encoding issues, so here is an example of the file I use

+4
source share
2 answers

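Unfortunately, as far as I know, none of the standard readers accept a multi-char delimiter; each of them expects a single-character separator. (This applies to read.table, read.delim and readr::read_delim alike.)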

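Instead, you can read the file as plain lines, drop the header rows, split every line on the three-space delimiter, and bind the pieces into a data.frame - as shown step by step below.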

(The sample data, file1 and file2, is defined at the end of this answer.)

# Read the sample text line by line. The text connection is kept in a variable
# so it can be closed explicitly rather than left open until garbage collection
# (which produces a "closing unused connection" warning).
con <- textConnection(file1)
x <- readLines(con)
close(con)
# Drop the header rows.
x <- x[x != 'header'] # or x <- x[-(1:5)]

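(This assumes that every header line reads exactly header; if yours differ, drop the first five lines by position instead, as in the commented alternative. Lines that are empty, "", would have to be removed as well.)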

# Cut each line at every three-space delimiter, then inspect the resulting list.
spl <- strsplit(x, split = "   ")
str(object = spl)
# List of 3
#  $ : chr [1:31] "01130009.JPG" "JPEG" "" "" ...
#  $ : chr [1:20] "01130009.JPG" "JPEG" "" "" ...
#  $ : chr [1:7] "01130009.JPG" "JPEG" "" "" ...

As you can see, each line splits into a different number of pieces, and many of them are empty ...

spl[[1]]
#  [1] "01130009.JPG"                                                                
#  [2] "JPEG"                                                                        
#  [3] ""                                                                            
#  [4] ""                                                                            
#  [5] "2/5/2018 3:53:44 PM"                                                         
#  [6] "G:\\AAA AAAAAAAA\\AAAAA\\BBBB BBBB & BBBBB BBBBB\\CAM_07-0008\\Farther Downg"
#  [7] "Gray Fox"                                                                    
#  [8] ""                                                                            
#  [9] ""                                                                            
# [10] ""                                                                            
# [11] ""                                                                            
# [12] ""                                                                            
# [13] ""                                                                            
# [14] ""                                                                            
# [15] ""                                                                            
# [16] ""                                                                            
# [17] ""                                                                            
# [18] ""                                                                            
# [19] ""                                                                            
# [20] ""                                                                            
# [21] ""                                                                            
# [22] ""                                                                            
# [23] ""                                                                            
# [24] ""                                                                            
# [25] ""                                                                            
# [26] ""                                                                            
# [27] ""                                                                            
# [28] ""                                                                            
# [29] ""                                                                            
# [30] ""                                                                            
# [31] ""                                                                            

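The wide gap after JPEG produces two empty fields, and the trailing padding produces many more, so keep only the first seven elements of each line: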

# Keep only the first seven pieces of every line.
spl <- lapply(spl, function(pieces) pieces[1:7])

Then bind the rows into a data.frame:

# Stack the per-line vectors as rows of a matrix and convert it to a data.frame.
as.data.frame(
  do.call(what = rbind, args = spl),
  stringsAsFactors = FALSE
)
#             V1   V2 V3 V4                  V5
# 1 01130009.JPG JPEG       2/5/2018 3:53:44 PM
# 2 01130009.JPG JPEG       2/5/2018 3:53:44 PM
# 3 01130009.JPG JPEG       2/5/2018 3:53:44 PM
#                                                                             V6
# 1 G:\\AAA AAAAAAAA\\AAAAA\\BBBB BBBB & BBBBB BBBBB\\CAM_07-0008\\Farther Downg
# 2 G:\\AAA AAAAAAAA\\AAAAA\\BBBB BBBB & BBBBB BBBBB\\CAM_07-0008\\Farther Downg
# 3 G:\\AAA AAAAAAAA\\AAAAA\\BBBB BBBB & BBBBB BBBBB\\CAM_07-0008\\Farther Downg
#                                           V7
# 1                                   Gray Fox
# 2 Direct Register Walk, Gait, Gray Fox, Stop
# 3                                   Gray Fox

The same steps work for the second file:

# Read the second sample line by line; the connection is closed explicitly so
# it is not left open until garbage collection.
con <- textConnection(file2)
x <- readLines(con)
close(con)
# Drop the header rows.
x <- x[x != 'header'] # or x <- x[-(1:5)]
# Split on the literal three-space delimiter (fixed = TRUE: no regex needed)
# and keep the first seven pieces of every line.
spl <- lapply(strsplit(x, '   ', fixed = TRUE), `[`, 1:7)
# Bind the per-line vectors into a data.frame, one row per line.
as.data.frame(do.call(rbind, spl), stringsAsFactors = FALSE)
#             V1   V2 V3 V4                  V5
# 1 01130009.JPG JPEG       2/5/2018 3:53:44 PM
# 2 01130009.JPG JPEG       2/5/2018 3:53:44 PM
# 3 01130009.JPG JPEG       2/5/2018 3:53:44 PM
#                                                                                   V6
# 1 G:\\AAA AAAAAAAA\\AAAAA AA\\BBBB BBBB & BBBBB BBBBB\\CAM_07-0008\\Farther DowngBBB
# 2 G:\\AAA AAAAAAAA\\AAAAA AA\\BBBB BBBB & BBBBB BBBBB\\CAM_07-0008\\Farther DowngBBB
# 3 G:\\AAA AAAAAAAA\\AAAAA AA\\BBBB BBBB & BBBBB BBBBB\\CAM_07-0008\\Farther DowngBBB
#                                           V7
# 1                                   Gray Fox
# 2 Direct Register Walk, Gait, Gray Fox, Stop
# 3                                   Gray Fox

Data:

# note: replaced single '\' with double '\\' for R string-handling only
file1 <- 'header
header
header
header
header
01130009.JPG   JPEG         2/5/2018 3:53:44 PM   G:\\AAA AAAAAAAA\\AAAAA\\BBBB BBBB & BBBBB BBBBB\\CAM_07-0008\\Farther Downg   Gray Fox                                                                           
01130009.JPG   JPEG         2/5/2018 3:53:44 PM   G:\\AAA AAAAAAAA\\AAAAA\\BBBB BBBB & BBBBB BBBBB\\CAM_07-0008\\Farther Downg   Direct Register Walk, Gait, Gray Fox, Stop                                         
01130009.JPG   JPEG         2/5/2018 3:53:44 PM   G:\\AAA AAAAAAAA\\AAAAA\\BBBB BBBB & BBBBB BBBBB\\CAM_07-0008\\Farther Downg   Gray Fox   '
file2 <- 'header
header
header
header
header
01130009.JPG   JPEG         2/5/2018 3:53:44 PM   G:\\AAA AAAAAAAA\\AAAAA AA\\BBBB BBBB & BBBBB BBBBB\\CAM_07-0008\\Farther DowngBBB   Gray Fox                                                                           
01130009.JPG   JPEG         2/5/2018 3:53:44 PM   G:\\AAA AAAAAAAA\\AAAAA AA\\BBBB BBBB & BBBBB BBBBB\\CAM_07-0008\\Farther DowngBBB   Direct Register Walk, Gait, Gray Fox, Stop                                         
01130009.JPG   JPEG         2/5/2018 3:53:44 PM   G:\\AAA AAAAAAAA\\AAAAA AA\\BBBB BBBB & BBBBB BBBBB\\CAM_07-0008\\Farther DowngBBB   Gray Fox   '
+4

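Alternatively, you can use gsub to replace every run of 3 spaces with a single-character separator and then read the text with read.table (trailing spaces have been removed from the data here):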

> mytext = "01130009.JPG   JPEG         2/5/2018 3:53:44 PM   G:\\AAA AAAAAAAA\\AAAAA\\BBBB BBBB & BBBBB BBBBB\\CAM_07-0008\\Farther Downg   Gray Fox
01130009.JPG   JPEG         2/5/2018 3:53:44 PM   G:\\AAA AAAAAAAA\\AAAAA\\BBBB BBBB & BBBBB BBBBB\\CAM_07-0008\\Farther Downg   Direct Register Walk, Gait, Gray Fox, Stop
01130009.JPG   JPEG         2/5/2018 3:53:44 PM   G:\\AAA AAAAAAAA\\AAAAA\\BBBB BBBB & BBBBB BBBBB\\CAM_07-0008\\Farther Downg   Gray Fox"

> # Turn every three-space delimiter into "|" and parse the result. Quote and
> # comment processing are switched off so that an apostrophe, a double quote
> # or a "#" inside a field is kept as literal text.
> ddf <- read.table(text = gsub("   ", "|", mytext, fixed = TRUE),
+                   header = FALSE, sep = "|", quote = "", comment.char = "")
> ddf 
            V1   V2 V3 V4                  V5                                                                           V6
1 01130009.JPG JPEG NA NA 2/5/2018 3:53:44 PM G:\\AAA AAAAAAAA\\AAAAA\\BBBB BBBB & BBBBB BBBBB\\CAM_07-0008\\Farther Downg
2 01130009.JPG JPEG NA NA 2/5/2018 3:53:44 PM G:\\AAA AAAAAAAA\\AAAAA\\BBBB BBBB & BBBBB BBBBB\\CAM_07-0008\\Farther Downg
3 01130009.JPG JPEG NA NA 2/5/2018 3:53:44 PM G:\\AAA AAAAAAAA\\AAAAA\\BBBB BBBB & BBBBB BBBBB\\CAM_07-0008\\Farther Downg
                                          V7
1                                   Gray Fox
2 Direct Register Walk, Gait, Gray Fox, Stop
3                                   Gray Fox

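Edit: as @r2evans pointed out, trailing spaces can be removed with gsub(" *$", "", ...). Alternatively, use a small helper for trimming trailing whitespace in R: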

# Remove any run of whitespace at the end of each string in x.
trim.trailing <- function(x) {
  sub(pattern = "\\s+$", replacement = "", x = x)
}

With readLines you can read the lines straight from the file:

> # Read the file, drop the header rows, trim the padding and split on "   ".
> # The path is passed to readLines directly, so no connection object is left
> # behind.
> mytext <- readLines('testfile.txt')               # read file text
> mytext <- mytext[-(1:5)]                          # remove first 5 rows ('header')
> mytext <- sub("\\s+$", "", mytext)                # remove trailing spaces
> mytext <- gsub("   ", "|", mytext, fixed = TRUE)  # change separator
> # quote = "" and comment.char = "" keep quotes and "#" in a field as literal text
> ddf <- read.table(text = mytext, header = FALSE, sep = "|",
+                   quote = "", comment.char = "")  # read columns from text
> ddf
            V1   V2 V3 V4                  V5                                                                           V6
1 01130009.JPG JPEG NA NA 2/5/2018 3:53:44 PM G:\\AAA AAAAAAAA\\AAAAA\\BBBB BBBB & BBBBB BBBBB\\CAM_07-0008\\Farther Downg
2 01130009.JPG JPEG NA NA 2/5/2018 3:53:44 PM G:\\AAA AAAAAAAA\\AAAAA\\BBBB BBBB & BBBBB BBBBB\\CAM_07-0008\\Farther Downg
3 01130009.JPG JPEG NA NA 2/5/2018 3:53:44 PM G:\\AAA AAAAAAAA\\AAAAA\\BBBB BBBB & BBBBB BBBBB\\CAM_07-0008\\Farther Downg
                                          V7
1                                   Gray Fox
2 Direct Register Walk, Gait, Gray Fox, Stop
3                                   Gray Fox

Alternatively, to go through a data.frame, read every line as a single column first and then split that column:

> # Step 1: read every line after the 5 header rows as one text column (V1).
> # Quote and comment processing are switched off so that quotes or "#" in the
> # data are kept as literal text.
> ddf1 <- read.table(file = 'testfile.txt', sep = '\n', skip = 5,
+                    quote = "", comment.char = "", stringsAsFactors = FALSE)
> # Step 2: trim the trailing padding (V1 is already a plain vector, so no
> # unlist() is needed).
> mytext <- sub("\\s+$", "", ddf1$V1)
> # Step 3: turn the three-space delimiter into "|" and parse the columns.
> ddf2 <- read.table(text = gsub("   ", "|", mytext, fixed = TRUE),
+                    header = FALSE, sep = "|", quote = "", comment.char = "")
> ddf2
            V1   V2 V3 V4                  V5                                                                           V6
1 01130009.JPG JPEG NA NA 2/5/2018 3:53:44 PM G:\\AAA AAAAAAAA\\AAAAA\\BBBB BBBB & BBBBB BBBBB\\CAM_07-0008\\Farther Downg
2 01130009.JPG JPEG NA NA 2/5/2018 3:53:44 PM G:\\AAA AAAAAAAA\\AAAAA\\BBBB BBBB & BBBBB BBBBB\\CAM_07-0008\\Farther Downg
3 01130009.JPG JPEG NA NA 2/5/2018 3:53:44 PM G:\\AAA AAAAAAAA\\AAAAA\\BBBB BBBB & BBBBB BBBBB\\CAM_07-0008\\Farther Downg
                                          V7
1                                   Gray Fox
2 Direct Register Walk, Gait, Gray Fox, Stop
3                                   Gray Fox
+2

Source: https://habr.com/ru/post/1694983/


All Articles