library(tidyverse)
stringr is built on top of stringi, which uses the ICU C library to provide fast, correct implementations of common string manipulations. stringr focusses on the most important and commonly used string manipulation functions whereas stringi provides a comprehensive set covering almost anything you can imagine. If you find that stringr is missing a function that you need, try looking in stringi. Both packages share similar conventions, so once you’ve mastered stringr, you should find stringi similarly easy to use.
# Work with small subsets of the example data sets that ship with stringr.
fruit <- stringr::fruit[1:30]
sentences <- stringr::sentences[1:20]
foo <- c(' a bc d ')

# Usage:
#   str_view(string, pattern, match = NA)
#   str_view_all(string, pattern, match = NA)
# str_view() highlights the first match in each string ...
str_view(fruit[1:5], '[aeiou]')
# ... while str_view_all() highlights every match.
str_view_all(fruit[1:5], '[aeiou]')
Special characters (metacharacters): [ ] \ ^ $ . | ? * + ( )
Escape character: \ (learn more about escaped characters in R strings: ?'"')
# Four ways to put a quote character inside an R string literal: a single
# quote inside double quotes, a double quote inside single quotes, and each
# kind escaped with a backslash inside its own delimiter.
quote <- c(" ' ", ' " ', ' \' ', " \" ")
# writeLines() shows the strings themselves, without the escaping backslashes.
writeLines(quote)
#> '
#> "
#> '
#> "
# "\n" in an R string is a newline character; the pattern matches it literally.
str_view_all(c("ab\nc", "12d", "ae2"), "\n")
# "\b" in an R string is the backspace character, so this pattern looks for
# the literal backspace embedded in "12\bd".
str_view_all(c("ab\nc", "12\bd", "ae2"), "\b")
# "\\b" reaches the regex engine as \b, the word-boundary anchor.
str_view_all(c("ab\nc", "12\bd", "ae2"), "\\b")
# "\\\\" reaches the regex engine as \\, which matches one literal backslash.
str_view_all(c("ab\nc", "12\\d", "ae2"), "\\\\")
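\d matches any digit (\D matches anything that is not a digit)
\s matches any whitespace, e.g. space, tab, newline (\S matches anything that is not whitespace)
\w matches any word character, which includes alphabetic characters, marks and decimal numbers (\W matches anything that is not a word character)
\b matches word boundaries, the transition between word and non-word characters (\B matches anywhere that is not a word boundary)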
# "\\d": every digit is highlighted; "abc" has none.
str_view_all(c("abc", "12d", "ae2"), "\\d")
# "\\w": every word character; the literal backslash in "\\12d" is not one.
str_view_all(c("abc", "\\12d", "1e2"), "\\w")
# "\\s": whitespace, i.e. the space in "a c" and the tab in "\\1\td".
str_view_all(c("a c", "\\1\td", "1e2"), "\\s")
# "\\b": the zero-width word boundaries in the same strings.
str_view_all(c("a c", "\\1\td", "1e2"), "\\b")
Alternatives: | (either side), [] (character set), () (grouping)
# (g|l): alternation inside a group, so it matches "longest" and "lonlest".
str_view(c("longest", "lonlest", "lonaest"), "lon(g|l)est")
# [gal]: any one of the listed characters, so it matches all three strings.
str_view(c("longest", "lonlest", "lonaest"), "lon[gal]est")
# [a-g]: any one character in the range a..g, so "lonlest" does not match.
str_view(c("longest", "lonlest", "lonaest"), "lon[a-g]est")
# [^a-g]: any one character NOT in the range a..g, so only "lonlest" matches.
str_view(c("longest", "lonlest", "lonaest"), "lon[^a-g]est")
. matches any character (except a newline)
# The second element contains an embedded newline.
x <- c("apple", "ba\nnana", "pear")
writeLines(x)
#> apple
#> ba
#> nana
#> pear
# ".a.": an "a" with one non-newline character on each side.
str_view_all(x, ".a.")
# ".a\\b": any character, then an "a" that is followed by a word boundary.
str_view_all(x, ".a\\b")
# ".a\\n": any character, then an "a" directly followed by a newline.
str_view_all(x, ".a\\n")
^/$ match the start/end of the string
# "^a" matches an "a" only at the very start of a string.
str_view(c("a\\abpl^e", "bbna\bna", "pear", "aaa"), "^a")
# "a$" matches an "a" only at the very end of a string.
str_view(c("apple", "banana", "pear"), "a$")
# To match "$" and "^" themselves, escape each one with a doubled backslash.
str_view(c("$^$"), "\\$\\^\\$")
Repetition
?: 0 or 1
+: 1 or more
*: 0 or more
{n}: exactly n; {n,}: n or more; {n,m}: between n and m
# All four calls search the same sentence so the quantifiers can be compared
# directly. 1888 in Roman numerals is MDCCCLXXXVIII (three C's).
# "CC?": a "C" optionally followed by one more "C".
str_view("1888 is the longest year in Roman numerals: MDCCCLXXXVIII", "CC?")
# "CC+": a "C" followed by one or more further "C"s.
str_view("1888 is the longest year in Roman numerals: MDCCCLXXXVIII", "CC+")
# "CC*": a "C" followed by zero or more further "C"s.
str_view("1888 is the longest year in Roman numerals: MDCCCLXXXVIII", "CC*")
# "C{2,3}": between two and three consecutive "C"s.
str_view("1888 is the longest year in Roman numerals: MDCCCLXXXVIII", "C{2,3}")
Grouping and backreferences
# "(..)\\1": any two characters immediately repeated, e.g. "coco" in "coconut".
str_view(fruit[16:25], "(..)\\1")
# "^(.).*\\1$": strings whose first and last characters are the same.
str_view(c("bacdb","bacdbacd"), "^(.).*\\1$")
# Usage:
#   regex(pattern, ignore_case = FALSE, multiline = FALSE, comments = FALSE,
#         dotall = FALSE)
# - ignore_case: Should case differences be ignored in the match?
# - multiline: If TRUE, "^" and "$" match the beginning and end of each line.
#   If FALSE, the default, only match the start and end of the input.
# - comments: If TRUE, white space and comments beginning with "#" are
#   ignored. Escape literal spaces with "\".
# - dotall: If TRUE, "." will also match line terminators.
x <- c("apple", "ba\nnana", "pear")
# Case-insensitive ".A."; dotall lets "." match the newline in "ba\nnana".
str_view_all(x, regex(".A.", ignore_case = TRUE, dotall = TRUE))
# With multiline = TRUE, "^" also matches at the start of the second line.
str_view_all("A\nb", regex("^b", multiline = TRUE))
# Functions that report whether, where, and how often a pattern matches:
# - str_detect(string, pattern, negate = FALSE): detect the presence or
#   absence of a pattern in a string.
# - str_which(string, pattern, negate = FALSE): find positions.
# - str_count(string, pattern = ""): count the number of matches in a string.
# - str_locate(string, pattern): returns an integer matrix.
# - str_locate_all(string, pattern): returns a list of integer matrices.
fruit
#> [1] "apple" "apricot" "avocado" "banana" "bell pepper"
#> [6] "bilberry" "blackberry" "blackcurrant" "blood orange" "blueberry"
#> [11] "boysenberry" "breadfruit" "canary melon" "cantaloupe" "cherimoya"
#> [16] "cherry" "chili pepper" "clementine" "cloudberry" "coconut"
#> [21] "cranberry" "cucumber" "currant" "damson" "date"
#> [26] "dragonfruit" "durian" "eggplant" "elderberry" "feijoa"

# Usage: str_detect(string, pattern, negate = FALSE)
# negate = TRUE flips the result: TRUE marks the strings WITHOUT an "a".
str_detect(fruit[1:5], 'a', negate = TRUE)
#> [1] FALSE FALSE FALSE FALSE TRUE
# A matrix input yields a plain logical vector with one entry per element.
test <- matrix(fruit[1:10], nrow = 5)
str_detect(test, 'a')
#> [1] TRUE TRUE TRUE TRUE FALSE FALSE TRUE TRUE TRUE FALSE
# A single string can be tested against a vector of patterns.
str_detect('appleapricot', fruit[1:5])
#> [1] TRUE TRUE FALSE FALSE FALSE
# Equal-length string and pattern vectors are matched pairwise.
str_detect(fruit[1:5], c('a', 'b', 'c', 'd', 'e'))
#> [1] TRUE FALSE TRUE FALSE TRUE
# Usage: str_which(string, pattern, negate = FALSE)
# Indices of the elements that contain an "a".
str_which(fruit[1:5], 'a')
#> [1] 1 2 3 4
# negate = TRUE returns the indices of the elements that do NOT match.
str_which(fruit, 'a', negate = TRUE)
#> [1] 5 6 10 11 16 17 18 19 20 22 29

# Usage: str_count(string, pattern = "")
str_count(fruit[1:5], 'a')
#> [1] 1 1 2 3 0
# Matches never overlap, so "abababa" contains "aba" only twice.
str_count('abababa', 'aba')
#> [1] 2
# Usage:
#   str_locate(string, pattern)
#   str_locate_all(string, pattern)
# str_locate() gives the start/end of the FIRST match in each string
# (NA when there is no match).
str_locate(fruit[1:5], 'a')
#> start end
#> [1,] 1 1
#> [2,] 1 1
#> [3,] 1 1
#> [4,] 2 2
#> [5,] NA NA
# str_locate_all() gives one matrix per string with a row for EVERY match;
# a string without matches yields a matrix with no rows.
str_locate_all(fruit[1:5], 'a')
#> [[1]]
#> start end
#> [1,] 1 1
#>
#> [[2]]
#> start end
#> [1,] 1 1
#>
#> [[3]]
#> start end
#> [1,] 1 1
#> [2,] 5 5
#>
#> [[4]]
#> start end
#> [1,] 2 2
#> [2,] 4 4
#> [3,] 6 6
#>
#> [[5]]
#> start end
# Functions that pull pieces out of strings:
# - str_sub(string, start = 1L, end = -1L): extract and replace substrings
#   from a character vector.
# - str_subset(string, pattern, negate = FALSE): keep strings matching a
#   pattern, or find positions.
# - str_extract(string, pattern): extract matched groups from a string.
# - str_extract_all(string, pattern, simplify = FALSE): extract matched
#   groups from a string.
fruit
#> [1] "apple" "apricot" "avocado" "banana" "bell pepper"
#> [6] "bilberry" "blackberry" "blackcurrant" "blood orange" "blueberry"
#> [11] "boysenberry" "breadfruit" "canary melon" "cantaloupe" "cherimoya"
#> [16] "cherry" "chili pepper" "clementine" "cloudberry" "coconut"
#> [21] "cranberry" "cucumber" "currant" "damson" "date"
#> [26] "dragonfruit" "durian" "eggplant" "elderberry" "feijoa"

# Usage: str_sub(string, start = 1L, end = -1L)
# Characters 1 to 3 of every string.
str_sub(fruit[1:5], 1, 3)
#> [1] "app" "apr" "avo" "ban" "bel"
# Negative positions count from the end: drop the last character ...
str_sub(fruit[1:5], end = -2)
#> [1] "appl" "aprico" "avocad" "banan" "bell peppe"
# ... or keep only the last two characters.
str_sub(fruit[1:5], -2)
#> [1] "le" "ot" "do" "na" "er"
# start and end may be vectors, giving each string its own range.
str_sub(fruit[1:5], c(1, 2, 3, 1, 1), c(3, 3, 4, 3, 3))
#> [1] "app" "pr" "oc" "ban" "bel"
# str_locate(fruit[1:5], '(..)\\1')
# str_sub(fruit[1:5], str_locate(fruit[1:5], '(..)\\1'))
# Usage: str_subset(string, pattern, negate = FALSE)
# Keep only the fruit names that contain a "b".
str_subset(fruit, 'b')
#> [1] "banana" "bell pepper" "bilberry" "blackberry" "blackcurrant"
#> [6] "blood orange" "blueberry" "boysenberry" "breadfruit" "cloudberry"
#> [11] "cranberry" "cucumber" "elderberry"
# Alternative spelling of the same filter:
# fruit[str_detect(fruit,'b')]
# Usage:
#   str_extract(string, pattern)
#   str_extract_all(string, pattern, simplify = FALSE)
# str_extract() returns the first match per string, or NA when none exists.
str_extract(c('aeaebcbc','bcaebc','abcccc'),'(..)\\1')
#> [1] "aeae" NA "cccc"
# First word in each sentence that ends in "s" or "es".
str_extract(sentences, '\\w+(es|s)\\b')
#> [1] "planks" NA NA "days" "is" "lemons"
#> [7] "was" "hogs" "hours" "stockings" "was" "is"
#> [13] "is" NA NA "helps" "fires" NA
#> [19] "across" "bonds"
# str_extract_all() returns every match, as a list with one vector per string.
str_extract_all(c('aeaebcbc','bcaebc','abcccc'),'(..)\\1')
#> [[1]]
#> [1] "aeae" "bcbc"
#>
#> [[2]]
#> character(0)
#>
#> [[3]]
#> [1] "cccc"
# simplify = TRUE returns a character matrix instead, padded with "".
str_extract_all(c('aeaebcbc','bcaebc','abcccc'),'(..)\\1', simplify = TRUE)
#> [,1] [,2]
#> [1,] "aeae" "bcbc"
#> [2,] "" ""
#> [3,] "cccc" ""
# Usage:
#   str_match(string, pattern): extract matched groups from a string.
#   str_match_all(string, pattern)
sentences[1:10]
#> [1] "The birch canoe slid on the smooth planks."
#> [2] "Glue the sheet to the dark blue background."
#> [3] "It's easy to tell the depth of a well."
#> [4] "These days a chicken leg is a rare dish."
#> [5] "Rice is often served in round bowls."
#> [6] "The juice of lemons makes fine punch."
#> [7] "The box was thrown beside the parked truck."
#> [8] "The hogs were fed chopped corn and garbage."
#> [9] "Four hours of steady work faced us."
#> [10] "Large size in stockings is hard to sell."
# str_match() returns a matrix: column 1 is the whole match, the following
# columns hold one capture group each. Rows without a match are all NA.
str_match(sentences[1:10], '(a|the) ([^ ]+)')
#> [,1] [,2] [,3]
#> [1,] "the smooth" "the" "smooth"
#> [2,] "the sheet" "the" "sheet"
#> [3,] "the depth" "the" "depth"
#> [4,] "a chicken" "a" "chicken"
#> [5,] NA NA NA
#> [6,] NA NA NA
#> [7,] "the parked" "the" "parked"
#> [8,] NA NA NA
#> [9,] NA NA NA
#> [10,] NA NA NA
# Two ways to get just the whole-match column:
# str_extract(sentences[1:10], '(a|the) ([^ ]+)')
# str_match(sentences[1:10], '(a|the) ([^ ]+)')[,1]
# str_match_all() returns a list with one such matrix per string, holding a
# row for every match.
str_match_all(sentences[1:5], '(a|the) ([^ ]+)')
#> [[1]]
#> [,1] [,2] [,3]
#> [1,] "the smooth" "the" "smooth"
#>
#> [[2]]
#> [,1] [,2] [,3]
#> [1,] "the sheet" "the" "sheet"
#> [2,] "the dark" "the" "dark"
#>
#> [[3]]
#> [,1] [,2] [,3]
#> [1,] "the depth" "the" "depth"
#> [2,] "a well." "a" "well."
#>
#> [[4]]
#> [,1] [,2] [,3]
#> [1,] "a chicken" "a" "chicken"
#> [2,] "a rare" "a" "rare"
#>
#> [[5]]
#> [,1] [,2] [,3]
# Functions that modify strings:
# - str_sub(string, start = 1L, end = -1L, omit_na = FALSE) <- value
# - str_replace(string, pattern, replacement)
# - str_replace_all(string, pattern, replacement)
# - str_remove(string, pattern)
# - str_remove_all(string, pattern)
# - str_to_upper(string, locale = "en")
# - str_to_lower(string, locale = "en")
# - str_to_title(string, locale = "en")
fruit
#> [1] "apple" "apricot" "avocado" "banana" "bell pepper"
#> [6] "bilberry" "blackberry" "blackcurrant" "blood orange" "blueberry"
#> [11] "boysenberry" "breadfruit" "canary melon" "cantaloupe" "cherimoya"
#> [16] "cherry" "chili pepper" "clementine" "cloudberry" "coconut"
#> [21] "cranberry" "cucumber" "currant" "damson" "date"
#> [26] "dragonfruit" "durian" "eggplant" "elderberry" "feijoa"

# Modify a copy so that `fruit` itself stays intact for later examples.
fruit_temp <- fruit
# Usage: str_sub(string, start = 1L, end = -1L, omit_na = FALSE) <- value
# The assignment form overwrites characters 1 to 3 of every element.
str_sub(fruit_temp, 1, 3) <- 'str'
fruit_temp
#> [1] "strle" "stricot" "strcado" "strana" "strl pepper"
#> [6] "strberry" "strckberry" "strckcurrant" "strod orange" "streberry"
#> [11] "strsenberry" "stradfruit" "strary melon" "strtaloupe" "strrimoya"
#> [16] "strrry" "strli pepper" "strmentine" "strudberry" "stronut"
#> [21] "strnberry" "strumber" "strrant" "strson" "stre"
#> [26] "strgonfruit" "strian" "strplant" "strerberry" "strjoa"
# Usage:
#   str_replace(string, pattern, replacement)
#   str_replace_all(string, pattern, replacement)
# str_replace() changes only the first match in each string ...
str_replace(fruit[1:5], 'a', '-')
#> [1] "-pple" "-pricot" "-vocado" "b-nana" "bell pepper"
# ... str_replace_all() changes every match.
str_replace_all(fruit[1:5], 'a', '-')
#> [1] "-pple" "-pricot" "-voc-do" "b-n-n-" "bell pepper"

# Usage:
#   str_remove(string, pattern)
#   str_remove_all(string, pattern)
example <- c("apple", "apap", "ap")
# Remove the first "ap" only.
str_remove(example, 'ap')
#> [1] "ple" "ap" ""
# Remove every "ap".
str_remove_all(example, 'ap')
#> [1] "ple" "" ""
# '[ap]' is a character set: remove every single "a" and every single "p".
str_remove_all(example, '[ap]')
#> [1] "le" "" ""
# Usage:
#   str_to_upper(string, locale = "en")
#   str_to_lower(string, locale = "en")
#   str_to_title(string, locale = "en")
sentences[1:5]
#> [1] "The birch canoe slid on the smooth planks."
#> [2] "Glue the sheet to the dark blue background."
#> [3] "It's easy to tell the depth of a well."
#> [4] "These days a chicken leg is a rare dish."
#> [5] "Rice is often served in round bowls."
# Everything in upper case.
str_to_upper(sentences[1:5])
#> [1] "THE BIRCH CANOE SLID ON THE SMOOTH PLANKS."
#> [2] "GLUE THE SHEET TO THE DARK BLUE BACKGROUND."
#> [3] "IT'S EASY TO TELL THE DEPTH OF A WELL."
#> [4] "THESE DAYS A CHICKEN LEG IS A RARE DISH."
#> [5] "RICE IS OFTEN SERVED IN ROUND BOWLS."
# Everything in lower case.
str_to_lower(sentences[1:5])
#> [1] "the birch canoe slid on the smooth planks."
#> [2] "glue the sheet to the dark blue background."
#> [3] "it's easy to tell the depth of a well."
#> [4] "these days a chicken leg is a rare dish."
#> [5] "rice is often served in round bowls."
# First letter of every word capitalised.
str_to_title(sentences[1:5])
#> [1] "The Birch Canoe Slid On The Smooth Planks."
#> [2] "Glue The Sheet To The Dark Blue Background."
#> [3] "It's Easy To Tell The Depth Of A Well."
#> [4] "These Days A Chicken Leg Is A Rare Dish."
#> [5] "Rice Is Often Served In Round Bowls."
Join
str_c(..., sep = "", collapse = NULL)
str_dup(string, times)
str_glue(...)
str_glue_data(.x, ..., .na = "NA")
Split
str_split(string, pattern, n = Inf, simplify = FALSE)
str_split_fixed(string, pattern, n)
# Usage: str_c(..., sep = "", collapse = NULL)
letters[1:5]
#> [1] "a" "b" "c" "d" "e"
LETTERS[1:5]
#> [1] "A" "B" "C" "D" "E"
# One input vector: collapse joins its elements into a single string.
str_c(letters[1:5], collapse =' ')
#> [1] "a b c d e"
# Two or more input vectors: sep joins them element by element ...
str_c(letters[1:5], LETTERS[1:5], sep = '/')
#> [1] "a/A" "b/B" "c/C" "d/D" "e/E"
# ... and collapse then joins those results into a single string.
str_c(letters[1:5], LETTERS[1:5], sep = '/', collapse = ' ')
#> [1] "a/A b/B c/C d/D e/E"

# Usage: str_dup(string, times)
# Repeat every string twice.
str_dup(fruit[1:6], times = 2)
#> [1] "appleapple" "apricotapricot" "avocadoavocado"
#> [4] "bananabanana" "bell pepperbell pepper" "bilberrybilberry"
# times is recycled along the strings: 1, 2, 3, 1, 2, 3 repetitions.
str_dup(fruit[1:6], 1:3)
#> [1] "apple" "apricotapricot"
#> [3] "avocadoavocadoavocado" "banana"
#> [5] "bell pepperbell pepper" "bilberrybilberrybilberry"
# Usage:
#   str_glue(...)
#   str_glue_data(.x, ..., .na = "NA")
name <- 'Bob'
age <- 50
anniversary <- as.Date("1991-10-12")
# Expressions inside {} are evaluated and inserted; the pieces are
# concatenated into one string.
str_glue(
  "My name is {name}, ",
  "my age next year is {age + 1}, ",
  "and my anniversary is {format(anniversary, '%A, %B %d, %Y')}."
)
#> My name is Bob, my age next year is 51, and my anniversary is Saturday, October 12, 1991.
# Named arguments supply values that take precedence over the variables
# defined above.
str_glue(
  "My name is {name}, ",
  "and my age next year is {age + 1}.",
  name = "Joe",
  age = 40
)
#> My name is Joe, and my age next year is 41.
# Vector inputs produce one output string per element.
str_glue('the {i}th letter is {letters[1:5]}', i = 1:5)
#> the 1th letter is a
#> the 2th letter is b
#> the 3th letter is c
#> the 4th letter is d
#> the 5th letter is e
# here str_glue() is more readable than str_c()
str_c('the ', 1:5, 'th letter is ', letters[1:5])
#> [1] "the 1th letter is a" "the 2th letter is b" "the 3th letter is c"
#> [4] "the 4th letter is d" "the 5th letter is e"
# Doubled braces print a literal brace, so {{{x}}} is "{", the value of x,
# then "}". The braces are kept balanced here.
str_glue('the {i}th {{letter}} is {{{letters[1:5]}}}', i = 1:5)
#> the 1th {letter} is {a}
#> the 2th {letter} is {b}
#> the 3th {letter} is {c}
#> the 4th {letter} is {d}
#> the 5th {letter} is {e}
# str_glue_data() looks the names up in the columns of its first argument.
str_glue_data(mtcars[1:5, ], '{hp} hp')
#> 110 hp
#> 110 hp
#> 93 hp
#> 110 hp
#> 175 hp
# Piped in, the data frame is also available as `.` inside the template.
mtcars[1:5, ] %>% str_glue_data('the {rownames(.)} has {hp} hp')
#> the Mazda RX4 has 110 hp
#> the Mazda RX4 Wag has 110 hp
#> the Datsun 710 has 93 hp
#> the Hornet 4 Drive has 110 hp
#> the Hornet Sportabout has 175 hp
# Usage:
#   str_split(string, pattern, n = Inf, simplify = FALSE)
#   str_split_fixed(string, pattern, n)
# str_split(string, pattern, simplify = TRUE) is equivalent to
# str_split_fixed(string, pattern, n = Inf).
fruits_split <- c(
  "apples and oranges and pears and bananas",
  "pineapples and mangos and guavas"
)
# Default: a list with one character vector of pieces per input string.
str_split(fruits_split, " and ")
#> [[1]]
#> [1] "apples" "oranges" "pears" "bananas"
#>
#> [[2]]
#> [1] "pineapples" "mangos" "guavas"
# simplify = TRUE returns a character matrix, padded with "" where needed.
str_split(fruits_split, " and ", simplify = TRUE)
#> [,1] [,2] [,3] [,4]
#> [1,] "apples" "oranges" "pears" "bananas"
#> [2,] "pineapples" "mangos" "guavas" ""
# Specify n to restrict the number of possible matches
str_split(fruits_split, " and ", n = 3)
#> [[1]]
#> [1] "apples" "oranges" "pears and bananas"
#>
#> [[2]]
#> [1] "pineapples" "mangos" "guavas"
str_split(fruits_split, " and ", n = 2)
#> [[1]]
#> [1] "apples" "oranges and pears and bananas"
#>
#> [[2]]
#> [1] "pineapples" "mangos and guavas"
# If n greater than number of pieces, no padding occurs
str_split(fruits_split, " and ", n = 5)
#> [[1]]
#> [1] "apples" "oranges" "pears" "bananas"
#>
#> [[2]]
#> [1] "pineapples" "mangos" "guavas"
# Use str_split_fixed() to return a character matrix
str_split_fixed(fruits_split, " and ", 3)
#> [,1] [,2] [,3]
#> [1,] "apples" "oranges" "pears and bananas"
#> [2,] "pineapples" "mangos" "guavas"
str_split_fixed(fruits_split, " and ", 4)
#> [,1] [,2] [,3] [,4]
#> [1,] "apples" "oranges" "pears" "bananas"
#> [2,] "pineapples" "mangos" "guavas" ""
# Functions that deal with string length and whitespace:
# - str_length(string)
# - str_pad(string, width, side = c("left", "right", "both"), pad = " "):
#   pad a string.
# - str_trunc(string, width, side = c("right", "left", "center"),
#   ellipsis = "..."): truncate a character string.
# - str_trim(string, side = c("both", "left", "right")): trim whitespace
#   from a string.
# - str_squish(string)
fruit
#> [1] "apple" "apricot" "avocado" "banana" "bell pepper"
#> [6] "bilberry" "blackberry" "blackcurrant" "blood orange" "blueberry"
#> [11] "boysenberry" "breadfruit" "canary melon" "cantaloupe" "cherimoya"
#> [16] "cherry" "chili pepper" "clementine" "cloudberry" "coconut"
#> [21] "cranberry" "cucumber" "currant" "damson" "date"
#> [26] "dragonfruit" "durian" "eggplant" "elderberry" "feijoa"

# Usage: str_length(string)
x <- c('adv', 'ss', 'awsd')
writeLines(x)
#> adv
#> ss
#> awsd
# length() counts the elements of the vector ...
length(x)
#> [1] 3
# ... whereas str_length() counts the characters in each element.
str_length(x)
#> [1] 3 2 4
# Usage: str_pad(string, width, side = c("left", "right", "both"), pad = " ")
# Pad on the right until every string is 10 characters wide.
str_pad(c("a", "abc", "abcdef"), 10, side = 'right')
#> [1] "a " "abc " "abcdef "
# width may be a vector: one padded copy per requested width.
str_pad("a", c(5, 7, 10))
#> [1] " a" " a" " a"
# pad may be a vector too: one padded copy per padding character.
str_pad("a", 10, pad = c("-", "_", " "))
#> [1] "---------a" "_________a" " a"
# Usage:
#   str_trunc(string, width, side = c("right", "left", "center"),
#             ellipsis = "...")
# Strings longer than 7 characters are cut so that, including the ellipsis,
# they are exactly 7 characters wide; shorter strings are left alone.
str_trunc(fruit, 7)
#> [1] "apple" "apricot" "avocado" "banana" "bell..." "bilb..." "blac..."
#> [8] "blac..." "bloo..." "blue..." "boys..." "brea..." "cana..." "cant..."
#> [15] "cher..." "cherry" "chil..." "clem..." "clou..." "coconut" "cran..."
#> [22] "cucu..." "currant" "damson" "date" "drag..." "durian" "eggp..."
#> [29] "elde..." "feijoa"
# Cut from the left instead and mark the cut with a custom ellipsis.
str_trunc(fruit, 7, side = 'left', ellipsis = '**')
#> [1] "apple" "apricot" "avocado" "banana" "**epper" "**berry" "**berry"
#> [8] "**rrant" "**range" "**berry" "**berry" "**fruit" "**melon" "**loupe"
#> [15] "**imoya" "cherry" "**epper" "**ntine" "**berry" "coconut" "**berry"
#> [22] "**umber" "currant" "damson" "date" "**fruit" "durian" "**plant"
#> [29] "**berry" "feijoa"
# Usage:
#   str_trim(string, side = c("both", "left", "right"))
#   str_squish(string)
# Trim whitespace from both ends of each string.
str_trim(c(' a', 'b ', 'c d'))
#> [1] "a" "b" "c d"
# Trim leading whitespace only; the trailing space of "b " is kept.
str_trim(c(' a', 'b ', 'c d'), side = 'left')
#> [1] "a" "b " "c d"
str_squish(c(' a', 'b ', 'c d'))
#> [1] "a" "b" "c d"
# By contrast, this would delete every whitespace character:
# str_replace_all(c(' a', 'b ', 'c d'), '\\s', '')
# Usage:
#   str_order(x, decreasing = FALSE, na_last = TRUE, locale = "en",
#             numeric = FALSE, ...)
#   str_sort(x, decreasing = FALSE, na_last = TRUE, locale = "en",
#            numeric = FALSE, ...)
bar <- c('b', 'c', 'a', 'd')
# str_order() returns the permutation of indices that sorts the vector ...
str_order(bar)
#> [1] 3 1 2 4
# ... so indexing with it yields the sorted vector.
bar[str_order(bar)]
#> [1] "a" "b" "c" "d"
x <- c("100a10", "100a5", "2b", "2a")
# Default: digits are compared character by character.
str_sort(x)
#> [1] "100a10" "100a5" "2a" "2b"
# numeric = TRUE compares runs of digits as numbers (2 < 100, 5 < 10).
str_sort(x, numeric = TRUE)
#> [1] "2a" "2b" "100a5" "100a10"
Exercise: Use str_length() and str_sub() to extract the middle character from a string. What will you do if the string has an even number of characters?
More details about regular expressions: CRAN | stringr/Regular expressions