library(tidyverse)
stringr is built on top of stringi, which uses the ICU C library to provide fast, correct implementations of common string manipulations. stringr focusses on the most important and commonly used string manipulation functions whereas stringi provides a comprehensive set covering almost anything you can imagine. If you find that stringr is missing a function that you need, try looking in stringi. Both packages share similar conventions, so once you’ve mastered stringr, you should find stringi similarly easy to use.
# Working data for the examples below: the first 30 fruit names and the first
# 20 sentences shipped with stringr, plus one whitespace-padded string.
fruit <- stringr::fruit[seq_len(30)]
sentences <- stringr::sentences[seq_len(20)]
foo <- ' a bc d '
str_view(string, pattern, match = NA)
str_view_all(string, pattern, match = NA)
# Highlight lowercase vowels in the first five fruit names with both viewers
str_view(string = fruit[1:5], pattern = '[aeiou]')
str_view_all(string = fruit[1:5], pattern = '[aeiou]')
[ ]
\
^
$
.
|
?
*
+
( )
\ (the escape character; learn more about escaped characters: ?'"')
# Four ways to put a quote character inside an R string literal: delimit the
# string with the other quote type, or escape the quote with a backslash.
quote <- c(
  " ' ",
  ' " ',
  ' \' ',
  " \" "
)
# writeLines() prints the raw contents, so the escaping backslashes are gone
writeLines(text = quote)
#> '
#> "
#> '
#> "
# "\n" in an R string is a newline character; the pattern matches it literally
str_view_all(string = c("ab\nc", "12d", "ae2"), pattern = "\n")
# "\b" in an R string is a backspace character, also matched literally
str_view_all(string = c("ab\nc", "12\bd", "ae2"), pattern = "\b")
# "\\b" reaches the regex engine as \b, the word-boundary assertion
str_view_all(string = c("ab\nc", "12\bd", "ae2"), pattern = "\\b")
# "\\\\" reaches the regex engine as \\, which matches one literal backslash
str_view_all(string = c("ab\nc", "12\\d", "ae2"), pattern = "\\\\")
\d
matches any digit (\D matches any non-digit)
\s
matches any whitespace, e.g. space, tab, newline (\S matches any non-whitespace)
\w
matches any word character, which includes alphabetic characters, marks and decimal numbers (\W matches any non-word character)
\b
matches word boundaries, the transition between word and non-word characters (\B matches any position that is not a word boundary)
# \d : digit characters
str_view_all(string = c("abc", "12d", "ae2"), pattern = "\\d")
# \w : word characters (the literal backslash in "\\12d" is not one)
str_view_all(string = c("abc", "\\12d", "1e2"), pattern = "\\w")
# \s : whitespace, here a space and a tab
str_view_all(string = c("a c", "\\1\td", "1e2"), pattern = "\\s")
# \b : zero-width word boundaries
str_view_all(string = c("a c", "\\1\td", "1e2"), pattern = "\\b")
| [] ()
# (g|l) : alternation, either "g" or "l"
str_view(string = c("longest", "lonlest", "lonaest"), pattern = "lon(g|l)est")
# [gal] : any one of the listed characters
str_view(string = c("longest", "lonlest", "lonaest"), pattern = "lon[gal]est")
# [a-g] : any one character in the range a to g
str_view(string = c("longest", "lonlest", "lonaest"), pattern = "lon[a-g]est")
# [^a-g] : any one character outside the range a to g
str_view(string = c("longest", "lonlest", "lonaest"), pattern = "lon[^a-g]est")
.
matches any character (except a newline)
# The second element contains an embedded newline
x <- c("apple", "ba\nnana", "pear")
writeLines(text = x)
#> apple
#> ba
#> nana
#> pear
# any character, "a", any character
str_view_all(string = x, pattern = ".a.")
# any character, then an "a" that sits right before a word boundary
str_view_all(string = x, pattern = ".a\\b")
# any character, "a", then a newline
str_view_all(string = x, pattern = ".a\\n")
^ / $
match the start / end of the string
# "^a" : an "a" at the very start of the string
str_view(string = c("a\\abpl^e", "bbna\bna", "pear", "aaa"), pattern = "^a")
# "a$" : an "a" at the very end of the string
str_view(string = c("apple", "banana", "pear"), pattern = "a$")
# Escaped with a backslash, "$" and "^" are matched as ordinary characters
str_view(string = "$^$", pattern = "\\$\\^\\$")
Repetition
?
: 0 or 1
+
: 1 or more
*
: 0 or more
{n} / {n,} / {n,m}
: exactly n / n or more / between n and m
# Quantifier demos on the Roman numeral for 1888 (MDCCCLXXXVIII).
# The sentence is defined once so every pattern runs against the same text.
# The "CC+" call previously carried a stray fourth "C" (MDCCCC...), which is
# not the numeral for 1888 and made its match incomparable with the others.
roman_1888 <- "1888 is the longest year in Roman numerals: MDCCCLXXXVIII"
str_view(roman_1888, "CC?")     # "C" followed by an optional second "C"
str_view(roman_1888, "CC+")     # "C" followed by one or more further "C"
str_view(roman_1888, "CC*")     # "C" followed by zero or more further "C"
str_view(roman_1888, "C{2,3}")  # two to three "C" in a row
Grouping and backreferences
# (..)\1 : any two characters immediately repeated, e.g. "coco" in "coconut"
str_view(string = fruit[16:25], pattern = "(..)\\1")
# ^(.).*\1$ : strings whose last character repeats the first one
str_view(string = c("bacdb","bacdbacd"), pattern = "^(.).*\\1$")
regex(pattern, ignore_case = FALSE, multiline = FALSE, comments = FALSE, dotall = FALSE)
ignore_case
: Should case differences be ignored in the match?
multiline
: If TRUE, “$” and “^” match the beginning and end of each line. If FALSE, the default, only match the start and end of the input.
comments
: If TRUE, white space and comments beginning with “#” are ignored. Escape literal spaces with “\\ ” (a backslash followed by a space).
dotall
: If TRUE, “.” will also match line terminators.
x <- c("apple", "ba\nnana", "pear")
# Modifier flags are spelled TRUE rather than T: T is an ordinary variable that
# can be reassigned, TRUE is a reserved word.
# Case-insensitive match in which "." may also consume a newline
str_view_all(x, regex(".A.", ignore_case = TRUE, dotall = TRUE))
# With multiline = TRUE, "^" also anchors at the start of each line
str_view_all("A\nb", regex("^b", multiline = TRUE))
str_detect(string, pattern, negate = FALSE)
: Detect the presence or absence of a pattern in a string.
str_which(string, pattern, negate = FALSE)
: Find the positions of the strings that match a pattern.
str_count(string, pattern = "")
: Count the number of matches in a string.
str_locate(string, pattern)
: Locate the first match in each string; returns an integer matrix.
str_locate_all(string, pattern)
: Locate every match in each string; returns a list of integer matrices.
fruit
#> [1] "apple" "apricot" "avocado" "banana" "bell pepper"
#> [6] "bilberry" "blackberry" "blackcurrant" "blood orange" "blueberry"
#> [11] "boysenberry" "breadfruit" "canary melon" "cantaloupe" "cherimoya"
#> [16] "cherry" "chili pepper" "clementine" "cloudberry" "coconut"
#> [21] "cranberry" "cucumber" "currant" "damson" "date"
#> [26] "dragonfruit" "durian" "eggplant" "elderberry" "feijoa"
str_detect(string, pattern, negate = FALSE)
# negate = TRUE flips the answer: TRUE marks the strings WITHOUT an "a".
# Written as TRUE because the shorthand T is a reassignable variable.
str_detect(fruit[1:5], 'a', negate = TRUE)
#> [1] FALSE FALSE FALSE FALSE TRUE
# A matrix is accepted, but the result comes back as a plain logical vector
test <- matrix(fruit[1:10], nrow = 5)
str_detect(test, 'a')
#> [1] TRUE TRUE TRUE TRUE FALSE FALSE TRUE TRUE TRUE FALSE
# One string checked against several patterns
str_detect('appleapricot', fruit[1:5])
#> [1] TRUE TRUE FALSE FALSE FALSE
# Equal-length string and pattern vectors are paired element by element
str_detect(fruit[1:5], c('a', 'b', 'c', 'd', 'e'))
#> [1] TRUE FALSE TRUE FALSE TRUE
str_which(string, pattern, negate = FALSE)
# Indices of the strings that contain an "a"
str_which(fruit[1:5], 'a')
#> [1] 1 2 3 4
# negate = TRUE (not the reassignable shorthand T) returns the indices of the
# strings that do NOT contain an "a"
str_which(fruit, 'a', negate = TRUE)
#> [1] 5 6 10 11 16 17 18 19 20 22 29
str_count(string, pattern = "")
# Number of "a" in each fruit name
str_count(string = fruit[1:5], pattern = 'a')
#> [1] 1 1 2 3 0
# Matches do not overlap: "abababa" yields two "aba", not three
str_count(string = 'abababa', pattern = 'aba')
#> [1] 2
str_locate(string, pattern)
str_locate_all(string, pattern)
# Position of the first "a" in each string; an NA row means no match
str_locate(string = fruit[1:5], pattern = 'a')
#> start end
#> [1,] 1 1
#> [2,] 1 1
#> [3,] 1 1
#> [4,] 2 2
#> [5,] NA NA
# Positions of every "a", returned as one matrix per input string
str_locate_all(string = fruit[1:5], pattern = 'a')
#> [[1]]
#> start end
#> [1,] 1 1
#>
#> [[2]]
#> start end
#> [1,] 1 1
#>
#> [[3]]
#> start end
#> [1,] 1 1
#> [2,] 5 5
#>
#> [[4]]
#> start end
#> [1,] 2 2
#> [2,] 4 4
#> [3,] 6 6
#>
#> [[5]]
#> start end
str_sub(string, start = 1L, end = -1L)
Extract and replace substrings from a character vector.
str_subset(string, pattern, negate = FALSE)
Keep strings matching a pattern, or find positions.
str_extract(string, pattern)
Extract the first matching pattern from each string.
str_extract_all(string, pattern, simplify = FALSE)
Extract all matching patterns from each string.
fruit
#> [1] "apple" "apricot" "avocado" "banana" "bell pepper"
#> [6] "bilberry" "blackberry" "blackcurrant" "blood orange" "blueberry"
#> [11] "boysenberry" "breadfruit" "canary melon" "cantaloupe" "cherimoya"
#> [16] "cherry" "chili pepper" "clementine" "cloudberry" "coconut"
#> [21] "cranberry" "cucumber" "currant" "damson" "date"
#> [26] "dragonfruit" "durian" "eggplant" "elderberry" "feijoa"
str_sub(string, start = 1L, end = -1L)
# Characters 1 through 3
str_sub(string = fruit[1:5], start = 1, end = 3)
#> [1] "app" "apr" "avo" "ban" "bel"
# Negative positions count from the end: drop the last character
str_sub(string = fruit[1:5], end = -2)
#> [1] "appl" "aprico" "avocad" "banan" "bell peppe"
# The last two characters
str_sub(string = fruit[1:5], start = -2)
#> [1] "le" "ot" "do" "na" "er"
# start and end may be vectors, paired with the strings element by element
str_sub(string = fruit[1:5], start = c(1, 2, 3, 1, 1), end = c(3, 3, 4, 3, 3))
#> [1] "app" "pr" "oc" "ban" "bel"
# str_locate(fruit[1:5], '(..)\\1')
# str_sub(fruit[1:5], str_locate(fruit[1:5], '(..)\\1'))
str_subset(string, pattern, negate = FALSE)
# Keep only the fruit names that contain a "b"
str_subset(string = fruit, pattern = 'b')
#> [1] "banana" "bell pepper" "bilberry" "blackberry" "blackcurrant"
#> [6] "blood orange" "blueberry" "boysenberry" "breadfruit" "cloudberry"
#> [11] "cranberry" "cucumber" "elderberry"
# Equivalent subsetting by hand:
# fruit[str_detect(fruit,'b')]
str_extract(string, pattern)
str_extract_all(string, pattern, simplify = FALSE)
# First doubled two-character sequence in each string; NA when there is none
str_extract(c('aeaebcbc','bcaebc','abcccc'),'(..)\\1')
#> [1] "aeae" NA "cccc"
# First word in each sentence that ends in "s" or "es"
str_extract(sentences, '\\w+(es|s)\\b')
#> [1] "planks" NA NA "days" "is" "lemons"
#> [7] "was" "hogs" "hours" "stockings" "was" "is"
#> [13] "is" NA NA "helps" "fires" NA
#> [19] "across" "bonds"
# Every match per string, returned as a list of character vectors
str_extract_all(c('aeaebcbc','bcaebc','abcccc'),'(..)\\1')
#> [[1]]
#> [1] "aeae" "bcbc"
#>
#> [[2]]
#> character(0)
#>
#> [[3]]
#> [1] "cccc"
# simplify = TRUE (spelled out; T is a reassignable variable) returns a
# character matrix padded with "" instead of a list
str_extract_all(c('aeaebcbc','bcaebc','abcccc'),'(..)\\1', simplify = TRUE)
#> [,1] [,2]
#> [1,] "aeae" "bcbc"
#> [2,] "" ""
#> [3,] "cccc" ""
str_match(string, pattern)
Extract matched groups from a string.
str_match_all(string, pattern)
sentences[1:10]
#> [1] "The birch canoe slid on the smooth planks."
#> [2] "Glue the sheet to the dark blue background."
#> [3] "It's easy to tell the depth of a well."
#> [4] "These days a chicken leg is a rare dish."
#> [5] "Rice is often served in round bowls."
#> [6] "The juice of lemons makes fine punch."
#> [7] "The box was thrown beside the parked truck."
#> [8] "The hogs were fed chopped corn and garbage."
#> [9] "Four hours of steady work faced us."
#> [10] "Large size in stockings is hard to sell."
# Column 1 is the whole match, columns 2 and 3 are the two capture groups:
# the article ("a" or "the") and the run of non-space characters after it
str_match(string = sentences[1:10], pattern = '(a|the) ([^ ]+)')
#> [,1] [,2] [,3]
#> [1,] "the smooth" "the" "smooth"
#> [2,] "the sheet" "the" "sheet"
#> [3,] "the depth" "the" "depth"
#> [4,] "a chicken" "a" "chicken"
#> [5,] NA NA NA
#> [6,] NA NA NA
#> [7,] "the parked" "the" "parked"
#> [8,] NA NA NA
#> [9,] NA NA NA
#> [10,] NA NA NA
# The whole-match column can also be obtained in either of these ways:
# str_extract(sentences[1:10], '(a|the) ([^ ]+)')
# str_match(sentences[1:10], '(a|the) ([^ ]+)')[,1]
# Every match per sentence, one matrix per input string
str_match_all(string = sentences[1:5], pattern = '(a|the) ([^ ]+)')
#> [[1]]
#> [,1] [,2] [,3]
#> [1,] "the smooth" "the" "smooth"
#>
#> [[2]]
#> [,1] [,2] [,3]
#> [1,] "the sheet" "the" "sheet"
#> [2,] "the dark" "the" "dark"
#>
#> [[3]]
#> [,1] [,2] [,3]
#> [1,] "the depth" "the" "depth"
#> [2,] "a well." "a" "well."
#>
#> [[4]]
#> [,1] [,2] [,3]
#> [1,] "a chicken" "a" "chicken"
#> [2,] "a rare" "a" "rare"
#>
#> [[5]]
#> [,1] [,2] [,3]
str_sub(string, start = 1L, end = -1L, omit_na=FALSE) <- value
str_replace(string, pattern, replacement)
str_replace_all(string, pattern, replacement)
str_remove(string, pattern)
str_remove_all(string, pattern)
str_to_upper(string, locale = "en")
str_to_lower(string, locale = "en")
str_to_title(string, locale = "en")
fruit
#> [1] "apple" "apricot" "avocado" "banana" "bell pepper"
#> [6] "bilberry" "blackberry" "blackcurrant" "blood orange" "blueberry"
#> [11] "boysenberry" "breadfruit" "canary melon" "cantaloupe" "cherimoya"
#> [16] "cherry" "chili pepper" "clementine" "cloudberry" "coconut"
#> [21] "cranberry" "cucumber" "currant" "damson" "date"
#> [26] "dragonfruit" "durian" "eggplant" "elderberry" "feijoa"
fruit_temp <- fruit
str_sub(string, start = 1L, end = -1L, omit_na=FALSE) <- value
# Assignment form: overwrite characters 1 to 3 of every element in place
str_sub(fruit_temp, start = 1, end = 3) <- 'str'
fruit_temp
#> [1] "strle" "stricot" "strcado" "strana" "strl pepper"
#> [6] "strberry" "strckberry" "strckcurrant" "strod orange" "streberry"
#> [11] "strsenberry" "stradfruit" "strary melon" "strtaloupe" "strrimoya"
#> [16] "strrry" "strli pepper" "strmentine" "strudberry" "stronut"
#> [21] "strnberry" "strumber" "strrant" "strson" "stre"
#> [26] "strgonfruit" "strian" "strplant" "strerberry" "strjoa"
str_replace(string, pattern, replacement)
str_replace_all(string, pattern, replacement)
# Replace only the first "a" in each string
str_replace(string = fruit[1:5], pattern = 'a', replacement = '-')
#> [1] "-pple" "-pricot" "-vocado" "b-nana" "bell pepper"
# Replace every "a" in each string
str_replace_all(string = fruit[1:5], pattern = 'a', replacement = '-')
#> [1] "-pple" "-pricot" "-voc-do" "b-n-n-" "bell pepper"
str_remove(string, pattern)
str_remove_all(string, pattern)
example <- c("apple", "apap", "ap")
# Drop the first "ap" only
str_remove(string = example, pattern = 'ap')
#> [1] "ple" "ap" ""
# Drop every "ap"
str_remove_all(string = example, pattern = 'ap')
#> [1] "ple" "" ""
# Drop every single "a" or "p" character
str_remove_all(string = example, pattern = '[ap]')
#> [1] "le" "" ""
str_to_upper(string, locale = "en")
str_to_lower(string, locale = "en")
str_to_title(string, locale = "en")
sentences[1:5]
#> [1] "The birch canoe slid on the smooth planks."
#> [2] "Glue the sheet to the dark blue background."
#> [3] "It's easy to tell the depth of a well."
#> [4] "These days a chicken leg is a rare dish."
#> [5] "Rice is often served in round bowls."
# All upper case
str_to_upper(string = sentences[1:5])
#> [1] "THE BIRCH CANOE SLID ON THE SMOOTH PLANKS."
#> [2] "GLUE THE SHEET TO THE DARK BLUE BACKGROUND."
#> [3] "IT'S EASY TO TELL THE DEPTH OF A WELL."
#> [4] "THESE DAYS A CHICKEN LEG IS A RARE DISH."
#> [5] "RICE IS OFTEN SERVED IN ROUND BOWLS."
# All lower case
str_to_lower(string = sentences[1:5])
#> [1] "the birch canoe slid on the smooth planks."
#> [2] "glue the sheet to the dark blue background."
#> [3] "it's easy to tell the depth of a well."
#> [4] "these days a chicken leg is a rare dish."
#> [5] "rice is often served in round bowls."
# First letter of every word capitalised
str_to_title(string = sentences[1:5])
#> [1] "The Birch Canoe Slid On The Smooth Planks."
#> [2] "Glue The Sheet To The Dark Blue Background."
#> [3] "It's Easy To Tell The Depth Of A Well."
#> [4] "These Days A Chicken Leg Is A Rare Dish."
#> [5] "Rice Is Often Served In Round Bowls."
join
str_c(..., sep = "", collapse = NULL)
str_dup(string, times)
str_glue(...)
str_glue_data(.x, ..., .na = "NA")
Split
str_split(string, pattern, n = Inf, simplify = FALSE)
str_split_fixed(string, pattern, n)
str_c(..., sep = "", collapse = NULL)
: Join multiple strings into a single string.
# Inputs: the first five lower-case and upper-case letters
letters[1:5]
#> [1] "a" "b" "c" "d" "e"
LETTERS[1:5]
#> [1] "A" "B" "C" "D" "E"
# One vector: collapse folds its elements into a single string
str_c(letters[1:5], collapse = ' ')
#> [1] "a b c d e"
# Several vectors: sep joins them element by element
str_c(letters[1:5], LETTERS[1:5], sep = '/')
#> [1] "a/A" "b/B" "c/C" "d/D" "e/E"
# Both: the joined pieces are then folded into a single string
str_c(letters[1:5], LETTERS[1:5], sep = '/', collapse = ' ')
#> [1] "a/A b/B c/C d/D e/E"
str_dup(string, times)
: Duplicate and concatenate strings within a character vector.
# Same repeat count for every string
str_dup(string = fruit[1:6], times = 2)
#> [1] "appleapple" "apricotapricot" "avocadoavocado"
#> [4] "bananabanana" "bell pepperbell pepper" "bilberrybilberry"
# A shorter times vector is recycled along the strings: 1, 2, 3, 1, 2, 3
str_dup(string = fruit[1:6], times = 1:3)
#> [1] "apple" "apricotapricot"
#> [3] "avocadoavocadoavocado" "banana"
#> [5] "bell pepperbell pepper" "bilberrybilberrybilberry"
str_glue(...)
str_glue_data(.x, ..., .na = "NA")
# Values looked up by the first template below
name <- 'Bob'
age <- 50
anniversary <- as.Date("1991-10-12")
# Expressions inside {} are evaluated as R code; the pieces are concatenated.
# The weekday and month names produced by format() depend on the locale.
str_glue(
  "My name is {name}, ",
  "my age next year is {age + 1}, ",
  "and my anniversary is {format(anniversary, '%A, %B %d, %Y')}."
)
#> My name is Bob, my age next year is 51, and my anniversary is Saturday, October 12, 1991.
# Named arguments supply values for the template and take precedence over the
# variables of the same name defined above
str_glue(
  "My name is {name}, ",
  "and my age next year is {age + 1}.",
  name = "Joe",
  age = 40
)
#> My name is Joe, and my age next year is 41.
# Vector inputs are interpolated element by element, one output line each
str_glue('the {i}th letter is {letters[1:5]}', i = 1:5)
#> the 1th letter is a
#> the 2th letter is b
#> the 3th letter is c
#> the 4th letter is d
#> the 5th letter is e
# here str_glue() is more readable than str_c()
str_c('the ', 1:5, 'th letter is ', letters[1:5])
#> [1] "the 1th letter is a" "the 2th letter is b" "the 3th letter is c"
#> [4] "the 4th letter is d" "the 5th letter is e"
# Doubled braces are literal: "{{" prints "{" and "}}" prints "}". To wrap an
# interpolated value in braces use three on EACH side: "{{" + "{expr}" + "}}".
# The template previously closed with only two braces and relied on a stray
# "}" being passed through unchanged.
str_glue('the {i}th {{letter}} is {{{letters[1:5]}}}', i = 1:5)
#> the 1th {letter} is {a}
#> the 2th {letter} is {b}
#> the 3th {letter} is {c}
#> the 4th {letter} is {d}
#> the 5th {letter} is {e}
# Template variables are looked up as columns of the data frame
str_glue_data(.x = mtcars[1:5, ], '{hp} hp')
#> 110 hp
#> 110 hp
#> 93 hp
#> 110 hp
#> 175 hp
# Piped form; "." inside the template refers to the piped-in data frame
mtcars[1:5, ] %>%
  str_glue_data('the {rownames(.)} has {hp} hp')
#> the Mazda RX4 has 110 hp
#> the Mazda RX4 Wag has 110 hp
#> the Datsun 710 has 93 hp
#> the Hornet 4 Drive has 110 hp
#> the Hornet Sportabout has 175 hp
str_split(string, pattern, n = Inf, simplify = FALSE)
str_split_fixed(string, pattern, n)
str_split(string, pattern, simplify = TRUE)
is equivalent to str_split_fixed(string, pattern, n = Inf)
# Two " and "-separated lists of different length, used by the split examples
fruits_split <- c(
  "apples and oranges and pears and bananas",
  "pineapples and mangos and guavas"
)
# Default: a list with one character vector of pieces per input string
str_split(string = fruits_split, pattern = " and ")
#> [[1]]
#> [1] "apples" "oranges" "pears" "bananas"
#>
#> [[2]]
#> [1] "pineapples" "mangos" "guavas"
# simplify = TRUE: a character matrix, short rows padded with ""
str_split(string = fruits_split, pattern = " and ", simplify = TRUE)
#> [,1] [,2] [,3] [,4]
#> [1,] "apples" "oranges" "pears" "bananas"
#> [2,] "pineapples" "mangos" "guavas" ""
# n caps the number of pieces; the last piece keeps the unsplit remainder
str_split(string = fruits_split, pattern = " and ", n = 3)
#> [[1]]
#> [1] "apples" "oranges" "pears and bananas"
#>
#> [[2]]
#> [1] "pineapples" "mangos" "guavas"
str_split(string = fruits_split, pattern = " and ", n = 2)
#> [[1]]
#> [1] "apples" "oranges and pears and bananas"
#>
#> [[2]]
#> [1] "pineapples" "mangos and guavas"
# When n exceeds the number of pieces the list result is not padded
str_split(string = fruits_split, pattern = " and ", n = 5)
#> [[1]]
#> [1] "apples" "oranges" "pears" "bananas"
#>
#> [[2]]
#> [1] "pineapples" "mangos" "guavas"
# str_split_fixed() always returns a character matrix with n columns
str_split_fixed(string = fruits_split, pattern = " and ", n = 3)
#> [,1] [,2] [,3]
#> [1,] "apples" "oranges" "pears and bananas"
#> [2,] "pineapples" "mangos" "guavas"
str_split_fixed(string = fruits_split, pattern = " and ", n = 4)
#> [,1] [,2] [,3] [,4]
#> [1,] "apples" "oranges" "pears" "bananas"
#> [2,] "pineapples" "mangos" "guavas" ""
str_length(string)
str_pad(string, width, side = c("left", "right", "both"), pad =" ")
Pad a string.
str_trunc(string, width, side = c("right", "left", "center"), ellipsis = "...")
Truncate a character string.
str_trim(string, side = c("both", "left", "right"))
Trim whitespace from a string.
str_squish(string)
Trim whitespace from both ends and reduce repeated whitespace inside a string to a single space.
fruit
#> [1] "apple" "apricot" "avocado" "banana" "bell pepper"
#> [6] "bilberry" "blackberry" "blackcurrant" "blood orange" "blueberry"
#> [11] "boysenberry" "breadfruit" "canary melon" "cantaloupe" "cherimoya"
#> [16] "cherry" "chili pepper" "clementine" "cloudberry" "coconut"
#> [21] "cranberry" "cucumber" "currant" "damson" "date"
#> [26] "dragonfruit" "durian" "eggplant" "elderberry" "feijoa"
str_length(string)
x <- c('adv', 'ss', 'awsd')
writeLines(text = x)
#> adv
#> ss
#> awsd
# length() counts the elements of the vector ...
length(x)
#> [1] 3
# ... while str_length() counts the characters in each element
str_length(string = x)
#> [1] 3 2 4
str_pad(string, width, side = c("left", "right", "both"), pad =" ")
Pad a string.
str_pad(c("a", "abc", "abcdef"), 10, side = 'right')
#> [1] "a         " "abc       " "abcdef    "
# width may be a vector: one padded copy of the string per width
str_pad(string = "a", width = c(5, 7, 10))
#> [1] "    a"      "      a"    "         a"
# pad may be a vector too: one padded copy per padding character
str_pad(string = "a", width = 10, pad = c("-", "_", " "))
#> [1] "---------a" "_________a" "         a"
str_trunc(string, width, side = c("right", "left", "center"), ellipsis = "...")
Truncate a character string.
str_trunc(fruit, 7)
#> [1] "apple" "apricot" "avocado" "banana" "bell..." "bilb..." "blac..."
#> [8] "blac..." "bloo..." "blue..." "boys..." "brea..." "cana..." "cant..."
#> [15] "cher..." "cherry" "chil..." "clem..." "clou..." "coconut" "cran..."
#> [22] "cucu..." "currant" "damson" "date" "drag..." "durian" "eggp..."
#> [29] "elde..." "feijoa"
# Cut from the left instead, marking the cut with a custom two-character ellipsis
str_trunc(string = fruit, width = 7, side = 'left', ellipsis = '**')
#> [1] "apple" "apricot" "avocado" "banana" "**epper" "**berry" "**berry"
#> [8] "**rrant" "**range" "**berry" "**berry" "**fruit" "**melon" "**loupe"
#> [15] "**imoya" "cherry" "**epper" "**ntine" "**berry" "coconut" "**berry"
#> [22] "**umber" "currant" "damson" "date" "**fruit" "durian" "**plant"
#> [29] "**berry" "feijoa"
str_trim(string, side = c("both", "left", "right"))
Trim whitespace from a string.
str_squish(string)
# The inputs carry runs of several spaces so the three calls give visibly
# different results. With single spaces only, str_trim() and str_squish()
# print the same thing and the example cannot show what str_squish() adds.
# Default side = "both": strip leading and trailing whitespace, keep inner runs
str_trim(c('  a', 'b  ', 'c   d'))
#> [1] "a"     "b"     "c   d"
# side = 'left': strip leading whitespace only
str_trim(c('  a', 'b  ', 'c   d'), side = 'left')
#> [1] "a"     "b  "   "c   d"
# str_squish(): trim both ends AND collapse inner runs to a single space
str_squish(c('  a', 'b  ', 'c   d'))
#> [1] "a"   "b"   "c d"
# Removing every whitespace character instead would be:
# str_replace_all(c('  a', 'b  ', 'c   d'), '\\s', '')
str_order(x, decreasing = FALSE, na_last = TRUE, locale = "en", numeric = FALSE, ...)
str_sort(x, decreasing = FALSE, na_last = TRUE, locale = "en", numeric = FALSE, ...)
bar <- c('b', 'c', 'a', 'd')
# str_order() returns the index permutation that would sort the vector
str_order(x = bar)
#> [1] 3 1 2 4
# Indexing with that permutation yields the sorted values
bar[str_order(x = bar)]
#> [1] "a" "b" "c" "d"
x <- c("100a10", "100a5", "2b", "2a")
# Default ordering compares the strings character by character
str_sort(x = x)
#> [1] "100a10" "100a5" "2a" "2b"
# numeric = TRUE compares runs of digits by their numeric value
str_sort(x = x, numeric = TRUE)
#> [1] "2a" "2b" "100a5" "100a10"
Exercise: use str_length() and str_sub() to extract the middle character from a string. What will you do if the string has an even number of characters?
More details about regular expressions: CRAN | stringr/Regular expressions