Here's the approach I would take with this. I would create a function that encodes the conditions I need to consider and then apply it to the input. I added comments to explain what happens in the function.
The function has 4 arguments:
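- `invec`: The character input vector.
- `thresh`: The number of leading characters used to determine the "stubs". Default = 5.
- `minlen`: How many characters longer than its "stub" an entry may be and still be kept. Default = 3.
- `strict`: Logical. If the `nchar` of the shortest entry is less than `thresh`, should `thresh` still be used as given? Default = FALSE. When it is `FALSE`, the length of the shortest entry is used instead; to keep `thresh` unchanged, set `strict` to `TRUE`.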
#' Reduce a vector of names to their short "stub" forms
#'
#' Upper-cases, de-duplicates and sorts `invec`, picks one stub per distinct
#' leading-`thresh`-character prefix (the first entry in sort order), and
#' returns each stub together with the entries that start with it and are at
#' most `minlen` characters longer.
#'
#' @param invec Character input vector.
#' @param thresh Number of leading characters used to group entries into
#'   stubs. Default 5.
#' @param minlen Maximum number of extra characters an entry may have beyond
#'   its stub and still be returned. Default 3.
#' @param strict If `TRUE`, `thresh` is used as given. Otherwise it is capped
#'   at the length of the shortest entry. Default `FALSE`.
#' @return A character vector of the retained (upper-cased) entries without
#'   duplicates; `character(0)` when there is nothing to process.
myfun <- function(invec, thresh = 5, minlen = 3, strict = FALSE) {
  invec <- sort(unique(toupper(invec)))

  # Nothing to match; this also avoids min() warning on a zero-length vector.
  if (length(invec) == 0L) {
    return(character(0))
  }

  if (!isTRUE(strict)) {
    thresh <- min(nchar(invec), thresh)
  }

  # One stub per distinct prefix: the first entry in sort order wins.
  stubs <- invec[!duplicated(substr(invec, 1, thresh))]

  kept <- lapply(stubs, function(stub) {
    # Prefix match only, so a stub does not pick up entries that merely
    # contain it somewhere in the middle (e.g. "BURGER" in "HAMBURGER").
    candidates <- invec[startsWith(invec, stub)]
    # `candidates == stub` keeps the stub itself even when minlen is negative.
    candidates[candidates == stub | nchar(candidates) <= nchar(stub) + minlen]
  })

  # Overlapping stubs can select the same entry; report each entry once.
  unique(unlist(kept, use.names = FALSE))
}
Here is some sample data:
# Sample menu: three base dishes, each followed by two longer variants.
dishes <- c(
  "DAL BHAT",
  "DAL BHAT-(SPICY)",
  "DAL BHAT WITH EXTRA RICE",
  "HAMBURGER",
  "HAMBURGER-BIG",
  "HAMBURGER2",
  "PIZZA",
  "PIZZA (PROSCIUTO)",
  "PIZZA_BOLOGNESE"
)
Example usage:
# minlen = 0: an entry is kept only if it is no longer than its stub.
myfun(dishes, minlen = 0)
# [1] "DAL BHAT" "HAMBURGER" "PIZZA"
# Default minlen = 3: entries up to 3 characters longer than their stub are
# kept as well.
myfun(dishes)
# [1] "DAL BHAT" "HAMBURGER" "HAMBURGER2" "PIZZA"
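Here are two more sample vectors. In "dishes2", the short entry "DAL" has been added; "dishes3" additionally contains the lowercase entry "pizza!!".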
# The sample menu plus the short entry "DAL".
dishes2 <- c(
  "DAL BHAT",
  "DAL BHAT-(SPICY)",
  "DAL BHAT WITH EXTRA RICE",
  "HAMBURGER",
  "HAMBURGER-BIG",
  "HAMBURGER2",
  "PIZZA",
  "PIZZA (PROSCIUTO)",
  "PIZZA_BOLOGNESE",
  "DAL"
)

# Same entries as dishes2 with a lowercase entry appended.
dishes3 <- c(dishes2, "pizza!!")
Results with these vectors:
# "DAL" has only 3 characters, so with strict = FALSE the requested thresh of
# 4 is lowered to 3.
myfun(dishes2, thresh = 4)
# [1] "DAL" "HAMBURGER" "HAMBURGER2" "PIZZA"

# Input is upper-cased first, so "pizza!!" is reported as "PIZZA!!".
myfun(dishes3)
# [1] "DAL" "HAMBURGER" "HAMBURGER2" "PIZZA" "PIZZA!!"

# strict = TRUE keeps thresh at 5, so "DAL" and "DAL BHAT" become separate
# stubs.
myfun(dishes3, strict = TRUE)
# [1] "DAL" "DAL BHAT" "HAMBURGER" "HAMBURGER2" "PIZZA" "PIZZA!!"