R String match for address using stringdist, stringdistmatrix

I have two large data sets: about half a million records, and the other about 70K. These datasets have an address. I want to combine if any of the addresses in a smaller dataset is present in a large one. As you might expect, the address can be written in different ways and in different cases, so it’s rather unpleasant to see that there is no match when it should match, and there is a match when it should not match. I did some research and figured out which stringdist package to use. However, I am stuck and I feel that I am not using it to the full, and some suggestions on this subject will help.

The following are examples of dummy data along with the code I created to explain the situation.

Address1 <- c("786, GALI NO 5, XYZ","rambo, 45, strret 4, atlast, pqr","23/4, 23RD FLOOR, STREET 2, ABC-E, PQR","45-B, GALI NO5, XYZ","HECTIC, 99 STREET, PQR")
df1 <- data.table(Address1)

Address2 <- c("abc, pqr, xyz","786, GALI NO 4 XYZ","45B, GALI NO 5, XYZ","del, 546, strret2, towards east, pqr","23/4, STREET 2, PQR")
df2 <- data.table(Address2)

df1[, key_match := gsub("[^[:alnum:]]", "", Address1)]
df2[, key_match := gsub("[^[:alnum:]]", "", Address2)]

fn_match = function(str, strVec, n){
  strVec[amatch(str, strVec, method = "dl", maxDist=n,useBytes = T)]
}

df1[!is.na(key_match)
       , address_match := 
      fn_match(key_match, df2$key_match,3)
       ]

, address_match df1. , 30 . data.table. , .

. , , . . ? !

0
1

, data.table, , package:parallel

 rbind.pages(
  parallel::mclapply(Address1, function(i){
    data.frame(
       src = i, 
       match = Address2[which.min(adist(i, Address2))]
     )
   }, mc.cores = parallel::detectCores() - 2)) %>% 
 select(`src (Address1)`= 1, `match (Address2)` = 2)

:

                          src (Address1)                     match (Address2)
1                    786, GALI NO 5, XYZ                   786, GALI NO 4 XYZ
2       rambo, 45, strret 4, atlast, pqr del, 546, strret2, towards east, pqr
3 23/4, 23RD FLOOR, STREET 2, ABC-E, PQR                  23/4, STREET 2, PQR
4                    45-B, GALI NO5, XYZ                  45B, GALI NO 5, XYZ
5                 HECTIC, 99 STREET, PQR                  23/4, STREET 2, PQR

Edit:

, , , ; , ,

rand_addy_one <- rep(Address1, 1000)[sample(1:1000, 1000)]
rand_addy_two <- rep(Address2, 3000)[sample(1:3000, 3000)]


system.time({
  test_one <<- rbind.pages(parallel::mclapply(rand_addy_one, function(i) {
    calc <- as.data.frame(drop(attr(adist(i, rand_addy_two, counts = TRUE), "counts")))
    calc$totals <- (rowSums(calc))
    calc %>% mutate(src = i, target = rand_addy_two) %>% 
      filter(totals == min(totals))
  }, mc.cores = parallel::detectCores() - 2))  %>% 
    select(`source Address1` = src, `target Address2(matched)` = target,
           insertions = ins, deletions = del, substitutions = sub,
           total_approx_dist = totals)
})

   user  system elapsed 
 24.940   1.480   3.384 

> nrow(test_one)
[1] 600000

, :

system.time({
   test_two <<- rbind.pages(parallel::mclapply(rand_addy_two, function(i) {
    calc <- as.data.frame(drop(attr(adist(i, rand_addy_one, counts = TRUE), "counts")))
    calc$totals <- (rowSums(calc))
    calc %>% mutate(src = i, target = rand_addy_one) %>% 
        filter(totals == min(totals))
}, mc.cores = parallel::detectCores() - 2))  %>% 
    select(`source Address2` = src, `target Address1(matched)` = target,
           insertions = ins, deletions = del, substitutions = sub,
           total_approx_dist = totals)
})

   user  system elapsed 
 27.512   1.280   4.077 

nrow(test_two)
[1] 720000
0

Source: https://habr.com/ru/post/1661936/


All Articles