Read files into R

library(tidyverse)

You can download all the supported files: text.csv text.zip import_test.xlsx training.sas7bdat and put them in your folder: ./data/

Import a delimited file (including csv & tsv) into a tibble

readr package

Functions Usage
read_csv Read a comma-delimited file
read_csv2 Read a semicolon-delimited file
read_tsv Read a tab-delimited file
read_delim Read a general-delimited file
read_fwf Read a fixed width file
read_table Read whitespace-separated columns
read_log Read common/combined log file

Read a comma-delimited file: read_csv

Arguments Default Option
file location a file, a connection, or literal data
col_names TRUE TRUE/FALSE(x1-xn)
character vector of column names
col_types cols(var1 = col_character())
cols_only(X1 = col_character())
cols(var1 = col_skip())
c("ic_")
col_types = list(.default = col_character())
locale default_locale() default or customize
na c("", "NA") a vector, c(" ", "NA", ".")
quote "\"" "\""
n_max Inf integer
skip 0 integer
guess_max Inf or 1000 integer
trim_ws TRUE TRUE/FALSE
skip_empty_rows TRUE TRUE/FALSE

Arguments

  1. file: A path, a connection, or literal data

    read_csv("x,y\n1,2\n3,4")
    #> # A tibble: 2 x 2
    #>       x     y
    #>   <dbl> <dbl>
    #> 1     1     2
    #> 2     3     4
    
    read_csv("./data/text.csv")
    #> 
    #> ── Column specification ────────────────────────────────────────────────────────
    #> cols(
    #>   StudyName = col_character(),
    #>   age = col_double(),
    #>   EnvironmentName = col_character(),
    #>   SiteGroupName = col_character(),
    #>   SiteNumber = col_character()
    #> )
    #> # A tibble: 11 x 5
    #>    StudyName      age EnvironmentName SiteGroupName SiteNumber
    #>    <chr>        <dbl> <chr>           <chr>         <chr>     
    #>  1 BGB-3111-217    56 UAT             "World"       UATU0001  
    #>  2 BGB-3111-217    65 .               "World"       <NA>      
    #>  3 BGB-3111-217    45 UAT             "\"\""        UATU0001  
    #>  4 BGB-3111-217    65 UAT              <NA>         UATU0001  
    #>  5 BGB-3111-217    21 UAT             "World"       UATU0001  
    #>  6 <NA>            NA <NA>             <NA>         <NA>      
    #>  7 BGB-3111-217    54 UAT             "World"       UATU#0001 
    #>  8 BGB-3111-217    78 UAT             "World"       UATU0001  
    #>  9 BGB-3111-217    89 UAT             "World"       UATU0001  
    #> 10 BGB-3111-217    36 UAT             "World"       #UATU0001 
    #> 11 BGB-3111-217    58 UAT             "World"       UATU0001
    
    read_csv("./data/text.zip")
    #> 
    #> ── Column specification ────────────────────────────────────────────────────────
    #> cols(
    #>   .default = col_character(),
    #>   SiteID = col_double(),
    #>   SubjectID = col_double(),
    #>   CRFVersion = col_double()
    #> )
    #> ℹ Use `spec()` for the full column specifications.
    #> # A tibble: 240 x 20
    #>    StudyName  EnvironmentName SiteGroupName SiteID SiteNumber SiteName SubjectID
    #>    <chr>      <chr>           <chr>          <dbl> <chr>      <chr>        <dbl>
    #>  1 BGB-3111-… UAT             World           1721 UATU0001   BGB UAT…    258577
    #>  2 BGB-3111-… UAT             World           1721 UATU0001   BGB UAT…    303338
    #>  3 BGB-3111-… UAT             World           1721 UATU0001   BGB UAT…    305387
    #>  4 BGB-3111-… UAT             World           1721 UATU0001   BGB UAT…    309326
    #>  5 BGB-3111-… UAT             World           1721 UATU0001   BGB UAT…    312356
    #>  6 BGB-3111-… UAT             World           1721 UATU0001   BGB UAT…    320585
    #>  7 BGB-3111-… UAT             World           1721 UATU0001   BGB UAT…    362433
    #>  8 BGB-3111-… UAT             World           1721 UATU0001   BGB UAT…    363219
    #>  9 BGB-3111-… UAT             World           1721 UATU0001   BGB UAT…    316194
    #> 10 BGB-3111-… UAT             World           1721 UATU0001   BGB UAT…    316250
    #> 11 BGB-3111-… UAT             World           1722 UATU0002   BGB UAT…    259154
    #> # … with 229 more rows, and 13 more variables: SubjectName <chr>,
    #> #   CRFVersion <dbl>, SubjectStatus <chr>, Created <chr>, ParamStudy <chr>,
    #> #   ParamEnvironment <chr>, ParamSiteGroup <chr>, ParamSites <chr>,
    #> #   ParamSubjects <chr>, Login <chr>, RoleName <chr>, PrintDateTime <chr>,
    #> #   TimeZone <chr>
    
    read_csv("https://github.com/tidyverse/readr/raw/master/inst/extdata/mtcars.csv")
    #> 
    #> ── Column specification ────────────────────────────────────────────────────────
    #> cols(
    #>   mpg = col_double(),
    #>   cyl = col_double(),
    #>   disp = col_double(),
    #>   hp = col_double(),
    #>   drat = col_double(),
    #>   wt = col_double(),
    #>   qsec = col_double(),
    #>   vs = col_double(),
    #>   am = col_double(),
    #>   gear = col_double(),
    #>   carb = col_double()
    #> )
    #> # A tibble: 32 x 11
    #>      mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
    #>    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
    #>  1  21       6  160    110  3.9   2.62  16.5     0     1     4     4
    #>  2  21       6  160    110  3.9   2.88  17.0     0     1     4     4
    #>  3  22.8     4  108     93  3.85  2.32  18.6     1     1     4     1
    #>  4  21.4     6  258    110  3.08  3.22  19.4     1     0     3     1
    #>  5  18.7     8  360    175  3.15  3.44  17.0     0     0     3     2
    #>  6  18.1     6  225    105  2.76  3.46  20.2     1     0     3     1
    #>  7  14.3     8  360    245  3.21  3.57  15.8     0     0     3     4
    #>  8  24.4     4  147.    62  3.69  3.19  20       1     0     4     2
    #>  9  22.8     4  141.    95  3.92  3.15  22.9     1     0     4     2
    #> 10  19.2     6  168.   123  3.92  3.44  18.3     1     0     4     4
    #> 11  17.8     6  168.   123  3.92  3.44  18.9     1     0     4     4
    #> # … with 21 more rows
  2. col_names: Define var names–either TRUE, FALSE or a character vector of column names, TRUE by default.

    read_csv("./data/text.csv", col_names = TRUE)
    #> 
    #> ── Column specification ────────────────────────────────────────────────────────
    #> cols(
    #>   StudyName = col_character(),
    #>   age = col_double(),
    #>   EnvironmentName = col_character(),
    #>   SiteGroupName = col_character(),
    #>   SiteNumber = col_character()
    #> )
    #> # A tibble: 11 x 5
    #>    StudyName      age EnvironmentName SiteGroupName SiteNumber
    #>    <chr>        <dbl> <chr>           <chr>         <chr>     
    #>  1 BGB-3111-217    56 UAT             "World"       UATU0001  
    #>  2 BGB-3111-217    65 .               "World"       <NA>      
    #>  3 BGB-3111-217    45 UAT             "\"\""        UATU0001  
    #>  4 BGB-3111-217    65 UAT              <NA>         UATU0001  
    #>  5 BGB-3111-217    21 UAT             "World"       UATU0001  
    #>  6 <NA>            NA <NA>             <NA>         <NA>      
    #>  7 BGB-3111-217    54 UAT             "World"       UATU#0001 
    #>  8 BGB-3111-217    78 UAT             "World"       UATU0001  
    #>  9 BGB-3111-217    89 UAT             "World"       UATU0001  
    #> 10 BGB-3111-217    36 UAT             "World"       #UATU0001 
    #> 11 BGB-3111-217    58 UAT             "World"       UATU0001
    
    read_csv("./data/text.csv", col_names = FALSE)
    #> 
    #> ── Column specification ────────────────────────────────────────────────────────
    #> cols(
    #>   X1 = col_character(),
    #>   X2 = col_character(),
    #>   X3 = col_character(),
    #>   X4 = col_character(),
    #>   X5 = col_character()
    #> )
    #> # A tibble: 12 x 5
    #>    X1           X2    X3              X4              X5        
    #>    <chr>        <chr> <chr>           <chr>           <chr>     
    #>  1 StudyName    age   EnvironmentName "SiteGroupName" SiteNumber
    #>  2 BGB-3111-217 56    UAT             "World"         UATU0001  
    #>  3 BGB-3111-217 65    .               "World"         <NA>      
    #>  4 BGB-3111-217 45    UAT             "\"\""          UATU0001  
    #>  5 BGB-3111-217 65    UAT              <NA>           UATU0001  
    #>  6 BGB-3111-217 21    UAT             "World"         UATU0001  
    #>  7 <NA>         <NA>  <NA>             <NA>           <NA>      
    #>  8 BGB-3111-217 54    UAT             "World"         UATU#0001 
    #>  9 BGB-3111-217 78    UAT             "World"         UATU0001  
    #> 10 BGB-3111-217 89    UAT             "World"         UATU0001  
    #> 11 BGB-3111-217 36    UAT             "World"         #UATU0001 
    #> # … with 1 more row
    
    read_csv("./data/text.csv", col_names = c("var1", "var2", "var3"))
    #> 
    #> ── Column specification ────────────────────────────────────────────────────────
    #> cols(
    #>   var1 = col_character(),
    #>   var2 = col_character(),
    #>   var3 = col_character()
    #> )
    #> Warning: 12 parsing failures.
    #> row col  expected    actual              file
    #>   1  -- 3 columns 5 columns './data/text.csv'
    #>   2  -- 3 columns 5 columns './data/text.csv'
    #>   3  -- 3 columns 5 columns './data/text.csv'
    #>   4  -- 3 columns 5 columns './data/text.csv'
    #>   5  -- 3 columns 5 columns './data/text.csv'
    #> ... ... ......... ......... .................
    #> See problems(...) for more details.
    #> # A tibble: 12 x 3
    #>    var1         var2  var3           
    #>    <chr>        <chr> <chr>          
    #>  1 StudyName    age   EnvironmentName
    #>  2 BGB-3111-217 56    UAT            
    #>  3 BGB-3111-217 65    .              
    #>  4 BGB-3111-217 45    UAT            
    #>  5 BGB-3111-217 65    UAT            
    #>  6 BGB-3111-217 21    UAT            
    #>  7 <NA>         <NA>  <NA>           
    #>  8 BGB-3111-217 54    UAT            
    #>  9 BGB-3111-217 78    UAT            
    #> 10 BGB-3111-217 89    UAT            
    #> 11 BGB-3111-217 36    UAT            
    #> # … with 1 more row
  3. col_types: Define var types–NULL, a cols() specification, or a string, “GUESS” by default.

    c = character i = integer n = number
    d = double l = logical f = factor
    D = date T = date time t = time
    ? = guess _ or - = skip
    read_csv("./data/text.csv", col_types = cols(age = col_character()))
    #> # A tibble: 11 x 5
    #>    StudyName    age   EnvironmentName SiteGroupName SiteNumber
    #>    <chr>        <chr> <chr>           <chr>         <chr>     
    #>  1 BGB-3111-217 56    UAT             "World"       UATU0001  
    #>  2 BGB-3111-217 65    .               "World"       <NA>      
    #>  3 BGB-3111-217 45    UAT             "\"\""        UATU0001  
    #>  4 BGB-3111-217 65    UAT              <NA>         UATU0001  
    #>  5 BGB-3111-217 21    UAT             "World"       UATU0001  
    #>  6 <NA>         <NA>  <NA>             <NA>         <NA>      
    #>  7 BGB-3111-217 54    UAT             "World"       UATU#0001 
    #>  8 BGB-3111-217 78    UAT             "World"       UATU0001  
    #>  9 BGB-3111-217 89    UAT             "World"       UATU0001  
    #> 10 BGB-3111-217 36    UAT             "World"       #UATU0001 
    #> 11 BGB-3111-217 58    UAT             "World"       UATU0001
    
    read_csv("./data/text.csv", col_types = cols_only(StudyName = col_character(), SiteNumber = col_character()))
    #> # A tibble: 11 x 2
    #>    StudyName    SiteNumber
    #>    <chr>        <chr>     
    #>  1 BGB-3111-217 UATU0001  
    #>  2 BGB-3111-217 <NA>      
    #>  3 BGB-3111-217 UATU0001  
    #>  4 BGB-3111-217 UATU0001  
    #>  5 BGB-3111-217 UATU0001  
    #>  6 <NA>         <NA>      
    #>  7 BGB-3111-217 UATU#0001 
    #>  8 BGB-3111-217 UATU0001  
    #>  9 BGB-3111-217 UATU0001  
    #> 10 BGB-3111-217 #UATU0001 
    #> 11 BGB-3111-217 UATU0001
    
    read_csv("./data/text.csv", col_types = cols(SiteNumber = col_skip(), StudyName = col_skip()))
    #> # A tibble: 11 x 3
    #>      age EnvironmentName SiteGroupName
    #>    <dbl> <chr>           <chr>        
    #>  1    56 UAT             "World"      
    #>  2    65 .               "World"      
    #>  3    45 UAT             "\"\""       
    #>  4    65 UAT              <NA>        
    #>  5    21 UAT             "World"      
    #>  6    NA <NA>             <NA>        
    #>  7    54 UAT             "World"      
    #>  8    78 UAT             "World"      
    #>  9    89 UAT             "World"      
    #> 10    36 UAT             "World"      
    #> 11    58 UAT             "World"
    
    read_csv("./data/text.csv", col_types = c("cn_cc"))
    #> # A tibble: 11 x 4
    #>    StudyName      age SiteGroupName SiteNumber
    #>    <chr>        <dbl> <chr>         <chr>     
    #>  1 BGB-3111-217    56 "World"       UATU0001  
    #>  2 BGB-3111-217    65 "World"       <NA>      
    #>  3 BGB-3111-217    45 "\"\""        UATU0001  
    #>  4 BGB-3111-217    65  <NA>         UATU0001  
    #>  5 BGB-3111-217    21 "World"       UATU0001  
    #>  6 <NA>            NA  <NA>         <NA>      
    #>  7 BGB-3111-217    54 "World"       UATU#0001 
    #>  8 BGB-3111-217    78 "World"       UATU0001  
    #>  9 BGB-3111-217    89 "World"       UATU0001  
    #> 10 BGB-3111-217    36 "World"       #UATU0001 
    #> 11 BGB-3111-217    58 "World"       UATU0001
    
    read_csv("./data/text.csv", col_types = list(.default = col_character()))
    #> # A tibble: 11 x 5
    #>    StudyName    age   EnvironmentName SiteGroupName SiteNumber
    #>    <chr>        <chr> <chr>           <chr>         <chr>     
    #>  1 BGB-3111-217 56    UAT             "World"       UATU0001  
    #>  2 BGB-3111-217 65    .               "World"       <NA>      
    #>  3 BGB-3111-217 45    UAT             "\"\""        UATU0001  
    #>  4 BGB-3111-217 65    UAT              <NA>         UATU0001  
    #>  5 BGB-3111-217 21    UAT             "World"       UATU0001  
    #>  6 <NA>         <NA>  <NA>             <NA>         <NA>      
    #>  7 BGB-3111-217 54    UAT             "World"       UATU#0001 
    #>  8 BGB-3111-217 78    UAT             "World"       UATU0001  
    #>  9 BGB-3111-217 89    UAT             "World"       UATU0001  
    #> 10 BGB-3111-217 36    UAT             "World"       #UATU0001 
    #> 11 BGB-3111-217 58    UAT             "World"       UATU0001
    
    read_csv("./data/text.csv", col_types = cols())
    #> # A tibble: 11 x 5
    #>    StudyName      age EnvironmentName SiteGroupName SiteNumber
    #>    <chr>        <dbl> <chr>           <chr>         <chr>     
    #>  1 BGB-3111-217    56 UAT             "World"       UATU0001  
    #>  2 BGB-3111-217    65 .               "World"       <NA>      
    #>  3 BGB-3111-217    45 UAT             "\"\""        UATU0001  
    #>  4 BGB-3111-217    65 UAT              <NA>         UATU0001  
    #>  5 BGB-3111-217    21 UAT             "World"       UATU0001  
    #>  6 <NA>            NA <NA>             <NA>         <NA>      
    #>  7 BGB-3111-217    54 UAT             "World"       UATU#0001 
    #>  8 BGB-3111-217    78 UAT             "World"       UATU0001  
    #>  9 BGB-3111-217    89 UAT             "World"       UATU0001  
    #> 10 BGB-3111-217    36 UAT             "World"       #UATU0001 
    #> 11 BGB-3111-217    58 UAT             "World"       UATU0001
  4. na: Character vector of strings to interpret as missing values – c('', 'NA')

    read_csv("./data/text.csv", na = c("", "NA", "."))
    #> 
    #> ── Column specification ────────────────────────────────────────────────────────
    #> cols(
    #>   StudyName = col_character(),
    #>   age = col_double(),
    #>   EnvironmentName = col_character(),
    #>   SiteGroupName = col_character(),
    #>   SiteNumber = col_character()
    #> )
    #> # A tibble: 11 x 5
    #>    StudyName      age EnvironmentName SiteGroupName SiteNumber
    #>    <chr>        <dbl> <chr>           <chr>         <chr>     
    #>  1 BGB-3111-217    56 UAT             "World"       UATU0001  
    #>  2 BGB-3111-217    65 <NA>            "World"       <NA>      
    #>  3 BGB-3111-217    45 UAT             "\"\""        UATU0001  
    #>  4 BGB-3111-217    65 UAT              <NA>         UATU0001  
    #>  5 BGB-3111-217    21 UAT             "World"       UATU0001  
    #>  6 <NA>            NA <NA>             <NA>         <NA>      
    #>  7 BGB-3111-217    54 UAT             "World"       UATU#0001 
    #>  8 BGB-3111-217    78 UAT             "World"       UATU0001  
    #>  9 BGB-3111-217    89 UAT             "World"       UATU0001  
    #> 10 BGB-3111-217    36 UAT             "World"       #UATU0001 
    #> 11 BGB-3111-217    58 UAT             "World"       UATU0001
  5. comment: A string used to identify comments. Any text after the comment characters will be silently ignored.

    read_csv("./data/text.csv", comment = "#")
    #> 
    #> ── Column specification ────────────────────────────────────────────────────────
    #> cols(
    #>   StudyName = col_character(),
    #>   age = col_double(),
    #>   EnvironmentName = col_character(),
    #>   SiteGroupName = col_character(),
    #>   SiteNumber = col_character()
    #> )
    #> Warning: 1 parsing failure.
    #> row col  expected    actual              file
    #>  10  -- 5 columns 4 columns './data/text.csv'
    #> # A tibble: 11 x 5
    #>    StudyName      age EnvironmentName SiteGroupName SiteNumber
    #>    <chr>        <dbl> <chr>           <chr>         <chr>     
    #>  1 BGB-3111-217    56 UAT             "World"       UATU0001  
    #>  2 BGB-3111-217    65 .               "World"       <NA>      
    #>  3 BGB-3111-217    45 UAT             "\"\""        UATU0001  
    #>  4 BGB-3111-217    65 UAT              <NA>         UATU0001  
    #>  5 BGB-3111-217    21 UAT             "World"       UATU0001  
    #>  6 <NA>            NA <NA>             <NA>         <NA>      
    #>  7 BGB-3111-217    54 UAT             "World"       UATU      
    #>  8 BGB-3111-217    78 UAT             "World"       UATU0001  
    #>  9 BGB-3111-217    89 UAT             "World"       UATU0001  
    #> 10 BGB-3111-217    36 UAT             "World"       <NA>      
    #> 11 BGB-3111-217    58 UAT             "World"       UATU0001
  6. trim_ws: Should leading and trailing whitespace be trimmed from each field before parsing it?

    read_csv("./data/text.csv", trim_ws = FALSE)
    #> 
    #> ── Column specification ────────────────────────────────────────────────────────
    #> cols(
    #>   StudyName = col_character(),
    #>   age = col_double(),
    #>   EnvironmentName = col_character(),
    #>   SiteGroupName = col_character(),
    #>   SiteNumber = col_character()
    #> )
    #> # A tibble: 11 x 5
    #>    StudyName      age EnvironmentName SiteGroupName  SiteNumber
    #>    <chr>        <dbl> <chr>           <chr>          <chr>     
    #>  1 BGB-3111-217    56 "UAT"           "World"        UATU0001  
    #>  2 BGB-3111-217    65 "."             "World"        <NA>      
    #>  3 BGB-3111-217    45 "UAT"           "\"\""         UATU0001  
    #>  4 BGB-3111-217    65 "  UAT"          <NA>          UATU0001  
    #>  5 BGB-3111-217    21 "UAT"           "  World     " UATU0001  
    #>  6 <NA>            NA  <NA>            <NA>          <NA>      
    #>  7 BGB-3111-217    54 "UAT"           "World"        UATU#0001 
    #>  8 BGB-3111-217    78 "UAT"           "World"        UATU0001  
    #>  9 BGB-3111-217    89 "UAT"           "World"        UATU0001  
    #> 10 BGB-3111-217    36 "UAT"           "World"        #UATU0001 
    #> 11 BGB-3111-217    58 "UAT"           "World"        UATU0001
  7. skip: Number of lines to skip before reading data.

    read_csv("./data/text.csv", skip = 2)
    #> Warning: Missing column names filled in: 'X5' [5]
    #> 
    #> ── Column specification ────────────────────────────────────────────────────────
    #> cols(
    #>   `BGB-3111-217` = col_character(),
    #>   `65` = col_double(),
    #>   . = col_character(),
    #>   World = col_character(),
    #>   X5 = col_character()
    #> )
    #> # A tibble: 9 x 5
    #>   `BGB-3111-217`  `65` .     World   X5       
    #>   <chr>          <dbl> <chr> <chr>   <chr>    
    #> 1 BGB-3111-217      45 UAT   "\"\""  UATU0001 
    #> 2 BGB-3111-217      65 UAT    <NA>   UATU0001 
    #> 3 BGB-3111-217      21 UAT   "World" UATU0001 
    #> 4 <NA>              NA <NA>   <NA>   <NA>     
    #> 5 BGB-3111-217      54 UAT   "World" UATU#0001
    #> 6 BGB-3111-217      78 UAT   "World" UATU0001 
    #> 7 BGB-3111-217      89 UAT   "World" UATU0001 
    #> 8 BGB-3111-217      36 UAT   "World" #UATU0001
    #> 9 BGB-3111-217      58 UAT   "World" UATU0001
  8. n_max: Maximum number of records to read.

    read_csv("./data/text.csv", n_max = 5)
    #> 
    #> ── Column specification ────────────────────────────────────────────────────────
    #> cols(
    #>   StudyName = col_character(),
    #>   age = col_double(),
    #>   EnvironmentName = col_character(),
    #>   SiteGroupName = col_character(),
    #>   SiteNumber = col_character()
    #> )
    #> # A tibble: 5 x 5
    #>   StudyName      age EnvironmentName SiteGroupName SiteNumber
    #>   <chr>        <dbl> <chr>           <chr>         <chr>     
    #> 1 BGB-3111-217    56 UAT             "World"       UATU0001  
    #> 2 BGB-3111-217    65 .               "World"       <NA>      
    #> 3 BGB-3111-217    45 UAT             "\"\""        UATU0001  
    #> 4 BGB-3111-217    65 UAT              <NA>         UATU0001  
    #> 5 BGB-3111-217    21 UAT             "World"       UATU0001
  9. locale: The locale controls defaults that vary from place to place

    locale()
    #> <locale>
    #> Numbers:  123,456.78
    #> Formats:  %AD / %AT
    #> Timezone: UTC
    #> Encoding: UTF-8
    #> <date_names>
    #> Days:   Sunday (Sun), Monday (Mon), Tuesday (Tue), Wednesday (Wed), Thursday
    #>         (Thu), Friday (Fri), Saturday (Sat)
    #> Months: January (Jan), February (Feb), March (Mar), April (Apr), May (May),
    #>         June (Jun), July (Jul), August (Aug), September (Sep), October
    #>         (Oct), November (Nov), December (Dec)
    #> AM/PM:  AM/PM
    
    locale(
      date_names = "en",
      date_format = "%AD",
      time_format = "%AT",
      decimal_mark = ".",
      grouping_mark = ",",
      tz = "UTC",
      encoding = "UTF-8",
      asciify = FALSE
    )
    #> <locale>
    #> Numbers:  123,456.78
    #> Formats:  %AD / %AT
    #> Timezone: UTC
    #> Encoding: UTF-8
    #> <date_names>
    #> Days:   Sunday (Sun), Monday (Mon), Tuesday (Tue), Wednesday (Wed), Thursday
    #>         (Thu), Friday (Fri), Saturday (Sat)
    #> Months: January (Jan), February (Feb), March (Mar), April (Apr), May (May),
    #>         June (Jun), July (Jul), August (Aug), September (Sep), October
    #>         (Oct), November (Nov), December (Dec)
    #> AM/PM:  AM/PM
  10. guess_max: Maximum number of records to use for guessing column types.

  11. quoted_na: Should missing values inside quotes be treated as missing values (the default) or strings

Import xls and xlsx files: readxl::read_xlsx

readxl package

library(readxl)
Functions Usage
read_excel Read xls and xlsx files
read_xlsx Read xlsx files
read_xls Read xls files
excel_sheet List all sheets in an excel spreadsheet
excel_format Determine if files are xls or xlsx

excel_sheet(), excel_format()

loc <-  c("./data/import_test.xlsx")
a <- excel_sheets(loc)
a
#> [1] "Sheet1" "Sheet2"

b <- excel_format(loc)
b
#> [1] "xlsx"

read_xlsx

Arguments Default Option
sheet 1 "sheetname", 2
range full "A1:D15", "Sheet!A1:B14"
cell_limits(c(4, 3), c(NA, NA))
cell_rows()
cell_cols()
col_names TRUE TRUE/FALSE(x1-xn)
character vector of column names
col_types cols(var1 = col_character())
cols_only(X1 = col_character())
cols(var1 = col_skip())
c("ic\_") c('text', 'guess', 'numeric')
col_types = "text"
na c("", "NA") a vector, c(" ", "NA", ".")
n_max Inf integer
skip 0 integer
guess_max Inf or 1000 integer
trim_ws TRUE TRUE/FALSE
.name_repair "unique" default or customize
  1. sheet: Sheet to read – "sheetname" or an integer (the position of the sheet).

    read_xlsx("./data/import_test.xlsx", sheet = 1)
    #> # A tibble: 6 x 8
    #>   Subject Sex      Age Height `Wei-ght` Score quote_escape eol        
    #>   <chr>   <chr>  <dbl>  <dbl>     <dbl> <dbl> <chr>        <chr>      
    #> 1 001     Female    26    165        64    89 "\"aa\""     "aaa\\nbbb"
    #> 2 002     Female    28    170        67    78 "\"65\""     "aaa\\bbb" 
    #> 3 003     Male      30    181        80    64 "\"aa\""     "aaanbbb"  
    #> 4 004     Male      25    160        60    87 "\"65\""     "ccc\\nbbb"
    #> 5 005     Female    36    175        60    57 "\"aa\""     "ccc\\bbb" 
    #> 6 006     Male      19    163        68    64 "\"65\""     "cccnbbb"
  2. range: A cell range to read from

    read_xlsx("./data/import_test.xlsx", range = "A1:D5")
    #> # A tibble: 4 x 4
    #>   Subject Sex      Age Height
    #>   <chr>   <chr>  <dbl>  <dbl>
    #> 1 001     Female    26    165
    #> 2 002     Female    28    170
    #> 3 003     Male      30    181
    #> 4 004     Male      25    160
    
    read_xlsx("./data/import_test.xlsx", range = cell_limits(c(4, 3), c(NA, NA)))
    #> # A tibble: 3 x 6
    #>    `30` `181`  `80`  `64` `"aa"`   aaanbbb    
    #>   <dbl> <dbl> <dbl> <dbl> <chr>    <chr>      
    #> 1    25   160    60    87 "\"65\"" "ccc\\nbbb"
    #> 2    36   175    60    57 "\"aa\"" "ccc\\bbb" 
    #> 3    19   163    68    64 "\"65\"" "cccnbbb"
    
    read_xlsx("./data/import_test.xlsx", range = cell_rows(1:3))
    #> # A tibble: 2 x 8
    #>   Subject Sex      Age Height `Wei-ght` Score quote_escape eol        
    #>   <chr>   <chr>  <dbl>  <dbl>     <dbl> <dbl> <chr>        <chr>      
    #> 1 001     Female    26    165        64    89 "\"aa\""     "aaa\\nbbb"
    #> 2 002     Female    28    170        67    78 "\"65\""     "aaa\\bbb"
    
    read_xlsx("./data/import_test.xlsx", range = cell_cols(1:5))
    #> # A tibble: 6 x 5
    #>   Subject Sex      Age Height `Wei-ght`
    #>   <chr>   <chr>  <dbl>  <dbl>     <dbl>
    #> 1 001     Female    26    165        64
    #> 2 002     Female    28    170        67
    #> 3 003     Male      30    181        80
    #> 4 004     Male      25    160        60
    #> 5 005     Female    36    175        60
    #> 6 006     Male      19    163        68
  3. col_types: Either NULL to guess all from the spreadsheet or a character vector containing one entry per column from these options: “skip”, “guess”, “logical”, “numeric”, “date”, “text” or “list”. If exactly one col_type is specified, it will be recycled.

    read_xlsx("./data/import_test.xlsx", col_types = "text")
    #> # A tibble: 6 x 8
    #>   Subject Sex    Age   Height `Wei-ght` Score quote_escape eol        
    #>   <chr>   <chr>  <chr> <chr>  <chr>     <chr> <chr>        <chr>      
    #> 1 001     Female 26    165    64        89    "\"aa\""     "aaa\\nbbb"
    #> 2 002     Female 28    170    67        78    "\"65\""     "aaa\\bbb" 
    #> 3 003     Male   30    181    80        64    "\"aa\""     "aaanbbb"  
    #> 4 004     Male   25    160    60        87    "\"65\""     "ccc\\nbbb"
    #> 5 005     Female 36    175    60        57    "\"aa\""     "ccc\\bbb" 
    #> 6 006     Male   19    163    68        64    "\"65\""     "cccnbbb"
  4. .name_repair: Handling of column names. By default, readxl ensures column names are not empty and are unique.

    my_custom_name_repair <- function(nms) {tolower(gsub("[.-]", "_", nms))}
    read_xlsx("./data/import_test.xlsx", .name_repair= my_custom_name_repair)
    #> # A tibble: 6 x 8
    #>   subject sex      age height wei_ght score quote_escape eol        
    #>   <chr>   <chr>  <dbl>  <dbl>   <dbl> <dbl> <chr>        <chr>      
    #> 1 001     Female    26    165      64    89 "\"aa\""     "aaa\\nbbb"
    #> 2 002     Female    28    170      67    78 "\"65\""     "aaa\\bbb" 
    #> 3 003     Male      30    181      80    64 "\"aa\""     "aaanbbb"  
    #> 4 004     Male      25    160      60    87 "\"65\""     "ccc\\nbbb"
    #> 5 005     Female    36    175      60    57 "\"aa\""     "ccc\\bbb" 
    #> 6 006     Male      19    163      68    64 "\"65\""     "cccnbbb"

Import SAS files into R: haven::read_sas

library(haven)
Arguments Default Option
catalog_file NULL
encoding NULL
catalog_encoding NULL
n_max Inf integer
skip 0 integer
col_select NULL a vector, c('subject', 'age')
.name_repair "unique" default or customize
  1. col_select: One or more selection expressions

    sas <- read_sas("./data/training.sas7bdat", col_select = c("Age", "Score", "quote_escape"))
    sas
    #> # A tibble: 6 x 3
    #>     Age Score quote_escape
    #>   <dbl> <dbl> <chr>       
    #> 1    26    89 "\"aa\""    
    #> 2    28    78 "\"65\""    
    #> 3    30    64 "\"aa\""    
    #> 4    25    87 "\"65\""    
    #> 5    36    57 "\"aa\""    
    #> 6    19    64 "\"65\""

Data Output

Write a data frame to a comma-delimited file: readr::write_csv

Arguments Default Option
file File or connection
delim ',', ';', '/'
na "NA" string
append FALSE FALSE/TRUE
col_names TRUE FALSE/TRUE
quote_escape "double" "double", "backslash", "none"
eol "\n"
  1. append: If FALSE, will overwrite existing file. If TRUE, will append to existing file. In both cases, if the file does not exist a new file is created.

    getwd()
    if (!dir.exists('./output')) {dir.create('./output')}
    
    write_csv(sas, "./output/output.csv", append = FALSE)
    write_csv(sas, "./output/output.csv", append = TRUE) 
  2. col_names: If FALSE, column names will not be included at the top of the file. If TRUE, column names will be included. If not specified, col_names will take the opposite value given to append.

    write_csv(sas, "./output/output.csv", col_names = FALSE)
    write_csv(sas, "./output/output.csv", col_names = TRUE)
  3. eol: The end of line character to use. Most commonly either "\n" for Unix style newlines, or "\r\n" for Windows style newlines.

    write_csv(sas, "./output/output.csv", eol = " a\n")
  4. quote_escape: The type of escaping to use for quoted values, one of "double", "backslash" or "none".

    write_csv(sas, "./output/output.csv", quote_escape = "none")

Write a data frame to a excel file: openxlsx::write.xlsx

Arguments Default Option
asTable FALSE
creator NULL string
sheetName NULL string
gridLines TRUE TRUE/FALSE
tabColour NULL yellow
zoom 100 10400
startCol 1 vector or numberic
startRow 1 vector or numberic
xy c(1, 1) vector
colNames/col.names TRUE TRUE/FALSE
rowNames/row.names FALSE TRUE/FALSE
headerStyle NULL custom by createstyle()
borders NULL "surrounding", "columns" or "rows" or NULL
borderColour NULL color
borderStyle NULL custom by createstyle()
keepNA FALSE TRUE/FALSE
na.string NULL string
firstActiveRow 1 numeric
firstActiveCol 1 numeric
firstRow FALSE TRUE/FALSE
firstCol FALSE TRUE/FALSE
colWidths NA "auto"
overwrite TRUE TRUE/FALSE
  1. asTable: whether with border and filter button, FALSE by default

  2. creator: A string specifying the workbook author

  3. sheetName: Name of the worksheet

  4. gridLines: A logical. If FALSE, the worksheet grid lines will be hidden.

  5. tabColour: Colour of the worksheet tab

  6. zoom: A numeric between 10 and 400. Worksheet zoom level as a percentage.

    write.xlsx(
      sas,
      "./output/output.xlsx",
      asTable = TRUE,
      creator = "yuhang",
      sheetName = 'name',
      gridLines = TRUE,
      tabColour = 'yellow',
      zoom = 200
    )
  7. startCol: A vector specifying the starting column(s) to write df

  8. startRow: A vector specifying the starting row(s) to write df

  9. xy: An alternative to specifying startCol and startRow individually. A vector of the form c(startCol, startRow)

  10. colNames/col.names: If TRUE, column names of x are written.

  11. rowNames/row.names: If TRUE, row names of x are written.

  12. headerStyle: Custom style to apply to column names.

  13. borders: Either "surrounding", "columns" or "rows" or NULL.

  14. borderColour: Colour of cell border, only borders is not NULL

  15. borderStyle: Border line style, customize by createstyle()

    write.xlsx(
      sas,
      "./output/output.xlsx",
      startCol = 4, startRow = 3,
      # xy = c(2, 3),
      col.names = FALSE,
      rowNames = TRUE,
      borders = "surrounding", borderColour = "blue"
    )
  16. keepNA: If TRUE, NA values are converted to #N/A (or na.string, if not NULL) in Excel, else NA cells will be empty. Defaults to FALSE.

  17. na.string: If not NULL, and if keepNA is TRUE, NA values are converted to this string in Excel. Defaults to NULL.

  18. firstActiveRow: Top row of active region to freeze pane, begin at 1.

  19. firstActiveCol: Furthest left column of active region to freeze pane, begin at 1.

  20. firstRow: If TRUE, freezes the first row (equivalent to firstActiveRow = 2).

  21. firstCol: If TRUE, freezes the first column (equivalent to firstActiveCol = 2).

  22. colWidths: Must be value "auto". Sets all columns containing data to auto width.

  23. overwrite: Overwrite existing file (Defaults to TRUE as with write.table)

    write.xlsx(
      sas,
      "./output/output.xlsx",
      firstActiveRow = 3, firstActiveCol = 3,
      # firstRow = TRUE, firstCol = TRUE,
      colWidths= "auto"
    )

Open more than one dataframe into different sheets

l <- list(sas, sas, sas)
write.xlsx(
  l,
  "./output/writeList1.xlsx",
  sheetName = c('a', 'b', 'c'),
  colWidths = c(NA, "auto", "auto"),
  startCol = c(1, 2, 3),
  startRow = 2,
  asTable = c(TRUE, TRUE, FALSE),
  withFilter = c(TRUE, FALSE, FALSE)
)

File/folder operations

Folder operations

  • dir.exists(): judge whether location exist, return TRUE or FALSE

  • dir.create(): create a path

  • unlink(): deletes the file(s) or directories

base::dir.exists("./output/test")
dir.create("./output/test/r", recursive = TRUE)
unlink("./output/test/r", recursive = TRUE)
dir.create("./output/test")
base::dir.exists("./output/test")
unlink("./output/test")

File operations

Functions Usage
file.create creates files with the given names
file.exists indicating whether the files exist
file.remove remove the files
file.rename rename files
file.copy copy files
dir/list.files list the names of files or directories in the named directory
file.create("./output/test/r/read.txt")
file.exists("./output/test/r/read.txt")
file.rename('./output/test/r/read.txt', './output/test/r/read2.txt')
file.copy('./output/test/r/read2.txt', './output/test/r/read.txt')
dir('./output/test/r')
file.remove('./output/test/r/read2.txt')