Read files into R

library(tidyverse)

You can download all the supported files: text.csv text.zip import_test.xlsx training.sas7bdat and put them in your folder: ./data/

Import a delimited file (including csv & tsv) into a tibble

readr package

Functions	Usage
read_csv	Read a comma-delimited file
read_csv2	Read a semicolon-delimited file
read_tsv	Read a tab-delimited file
read_delim	Read a general-delimited file
read_fwf	Read a fixed width file
read_table	Read whitespace-separated columns
read_log	Read common/combined log file

Read a comma-delimited file: read_csv

Arguments	Default	Option
file	location	a file, a connection, or literal data
col_names	`TRUE`	`TRUE`/`FALSE`(x1-xn)
		character vector of column names
col_types		`cols(var1 = col_character())`
		`cols_only(X1 = col_character())`
		`cols(var1 = col_skip())`
		`c("ic_")`
		`col_types = list(.default = col_character())`
locale	`default_locale()`	default or customize
na	`c("", "NA")`	a vector, `c(" ", "NA", ".")`
quote	`"\""`	`"\""`
n_max	`Inf`	integer
skip	`0`	integer
guess_max	`Inf or 1000`	integer
trim_ws	`TRUE`	`TRUE`/`FALSE`
skip_empty_rows	`TRUE`	`TRUE`/`FALSE`

Arguments

file: A path, a connection, or literal data

read_csv("x,y\n1,2\n3,4")
#> # A tibble: 2 x 2
#>       x     y
#>   <dbl> <dbl>
#> 1     1     2
#> 2     3     4

read_csv("./data/text.csv")
#> 
#> ── Column specification ────────────────────────────────────────────────────────
#> cols(
#>   StudyName = col_character(),
#>   age = col_double(),
#>   EnvironmentName = col_character(),
#>   SiteGroupName = col_character(),
#>   SiteNumber = col_character()
#> )
#> # A tibble: 11 x 5
#>    StudyName      age EnvironmentName SiteGroupName SiteNumber
#>    <chr>        <dbl> <chr>           <chr>         <chr>     
#>  1 BGB-3111-217    56 UAT             "World"       UATU0001  
#>  2 BGB-3111-217    65 .               "World"       <NA>      
#>  3 BGB-3111-217    45 UAT             "\"\""        UATU0001  
#>  4 BGB-3111-217    65 UAT              <NA>         UATU0001  
#>  5 BGB-3111-217    21 UAT             "World"       UATU0001  
#>  6 <NA>            NA <NA>             <NA>         <NA>      
#>  7 BGB-3111-217    54 UAT             "World"       UATU#0001 
#>  8 BGB-3111-217    78 UAT             "World"       UATU0001  
#>  9 BGB-3111-217    89 UAT             "World"       UATU0001  
#> 10 BGB-3111-217    36 UAT             "World"       #UATU0001 
#> 11 BGB-3111-217    58 UAT             "World"       UATU0001

read_csv("./data/text.zip")
#> 
#> ── Column specification ────────────────────────────────────────────────────────
#> cols(
#>   .default = col_character(),
#>   SiteID = col_double(),
#>   SubjectID = col_double(),
#>   CRFVersion = col_double()
#> )
#> ℹ Use `spec()` for the full column specifications.
#> # A tibble: 240 x 20
#>    StudyName  EnvironmentName SiteGroupName SiteID SiteNumber SiteName SubjectID
#>    <chr>      <chr>           <chr>          <dbl> <chr>      <chr>        <dbl>
#>  1 BGB-3111-… UAT             World           1721 UATU0001   BGB UAT…    258577
#>  2 BGB-3111-… UAT             World           1721 UATU0001   BGB UAT…    303338
#>  3 BGB-3111-… UAT             World           1721 UATU0001   BGB UAT…    305387
#>  4 BGB-3111-… UAT             World           1721 UATU0001   BGB UAT…    309326
#>  5 BGB-3111-… UAT             World           1721 UATU0001   BGB UAT…    312356
#>  6 BGB-3111-… UAT             World           1721 UATU0001   BGB UAT…    320585
#>  7 BGB-3111-… UAT             World           1721 UATU0001   BGB UAT…    362433
#>  8 BGB-3111-… UAT             World           1721 UATU0001   BGB UAT…    363219
#>  9 BGB-3111-… UAT             World           1721 UATU0001   BGB UAT…    316194
#> 10 BGB-3111-… UAT             World           1721 UATU0001   BGB UAT…    316250
#> 11 BGB-3111-… UAT             World           1722 UATU0002   BGB UAT…    259154
#> # … with 229 more rows, and 13 more variables: SubjectName <chr>,
#> #   CRFVersion <dbl>, SubjectStatus <chr>, Created <chr>, ParamStudy <chr>,
#> #   ParamEnvironment <chr>, ParamSiteGroup <chr>, ParamSites <chr>,
#> #   ParamSubjects <chr>, Login <chr>, RoleName <chr>, PrintDateTime <chr>,
#> #   TimeZone <chr>

read_csv("https://github.com/tidyverse/readr/raw/master/inst/extdata/mtcars.csv")
#> 
#> ── Column specification ────────────────────────────────────────────────────────
#> cols(
#>   mpg = col_double(),
#>   cyl = col_double(),
#>   disp = col_double(),
#>   hp = col_double(),
#>   drat = col_double(),
#>   wt = col_double(),
#>   qsec = col_double(),
#>   vs = col_double(),
#>   am = col_double(),
#>   gear = col_double(),
#>   carb = col_double()
#> )
#> # A tibble: 32 x 11
#>      mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
#>    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#>  1  21       6  160    110  3.9   2.62  16.5     0     1     4     4
#>  2  21       6  160    110  3.9   2.88  17.0     0     1     4     4
#>  3  22.8     4  108     93  3.85  2.32  18.6     1     1     4     1
#>  4  21.4     6  258    110  3.08  3.22  19.4     1     0     3     1
#>  5  18.7     8  360    175  3.15  3.44  17.0     0     0     3     2
#>  6  18.1     6  225    105  2.76  3.46  20.2     1     0     3     1
#>  7  14.3     8  360    245  3.21  3.57  15.8     0     0     3     4
#>  8  24.4     4  147.    62  3.69  3.19  20       1     0     4     2
#>  9  22.8     4  141.    95  3.92  3.15  22.9     1     0     4     2
#> 10  19.2     6  168.   123  3.92  3.44  18.3     1     0     4     4
#> 11  17.8     6  168.   123  3.92  3.44  18.9     1     0     4     4
#> # … with 21 more rows

col_names: Define var names–either TRUE, FALSE or a character vector of column names, TRUE by default.

read_csv("./data/text.csv", col_names = TRUE)
#> 
#> ── Column specification ────────────────────────────────────────────────────────
#> cols(
#>   StudyName = col_character(),
#>   age = col_double(),
#>   EnvironmentName = col_character(),
#>   SiteGroupName = col_character(),
#>   SiteNumber = col_character()
#> )
#> # A tibble: 11 x 5
#>    StudyName      age EnvironmentName SiteGroupName SiteNumber
#>    <chr>        <dbl> <chr>           <chr>         <chr>     
#>  1 BGB-3111-217    56 UAT             "World"       UATU0001  
#>  2 BGB-3111-217    65 .               "World"       <NA>      
#>  3 BGB-3111-217    45 UAT             "\"\""        UATU0001  
#>  4 BGB-3111-217    65 UAT              <NA>         UATU0001  
#>  5 BGB-3111-217    21 UAT             "World"       UATU0001  
#>  6 <NA>            NA <NA>             <NA>         <NA>      
#>  7 BGB-3111-217    54 UAT             "World"       UATU#0001 
#>  8 BGB-3111-217    78 UAT             "World"       UATU0001  
#>  9 BGB-3111-217    89 UAT             "World"       UATU0001  
#> 10 BGB-3111-217    36 UAT             "World"       #UATU0001 
#> 11 BGB-3111-217    58 UAT             "World"       UATU0001

read_csv("./data/text.csv", col_names = FALSE)
#> 
#> ── Column specification ────────────────────────────────────────────────────────
#> cols(
#>   X1 = col_character(),
#>   X2 = col_character(),
#>   X3 = col_character(),
#>   X4 = col_character(),
#>   X5 = col_character()
#> )
#> # A tibble: 12 x 5
#>    X1           X2    X3              X4              X5        
#>    <chr>        <chr> <chr>           <chr>           <chr>     
#>  1 StudyName    age   EnvironmentName "SiteGroupName" SiteNumber
#>  2 BGB-3111-217 56    UAT             "World"         UATU0001  
#>  3 BGB-3111-217 65    .               "World"         <NA>      
#>  4 BGB-3111-217 45    UAT             "\"\""          UATU0001  
#>  5 BGB-3111-217 65    UAT              <NA>           UATU0001  
#>  6 BGB-3111-217 21    UAT             "World"         UATU0001  
#>  7 <NA>         <NA>  <NA>             <NA>           <NA>      
#>  8 BGB-3111-217 54    UAT             "World"         UATU#0001 
#>  9 BGB-3111-217 78    UAT             "World"         UATU0001  
#> 10 BGB-3111-217 89    UAT             "World"         UATU0001  
#> 11 BGB-3111-217 36    UAT             "World"         #UATU0001 
#> # … with 1 more row

read_csv("./data/text.csv", col_names = c("var1", "var2", "var3"))
#> 
#> ── Column specification ────────────────────────────────────────────────────────
#> cols(
#>   var1 = col_character(),
#>   var2 = col_character(),
#>   var3 = col_character()
#> )
#> Warning: 12 parsing failures.
#> row col  expected    actual              file
#>   1  -- 3 columns 5 columns './data/text.csv'
#>   2  -- 3 columns 5 columns './data/text.csv'
#>   3  -- 3 columns 5 columns './data/text.csv'
#>   4  -- 3 columns 5 columns './data/text.csv'
#>   5  -- 3 columns 5 columns './data/text.csv'
#> ... ... ......... ......... .................
#> See problems(...) for more details.
#> # A tibble: 12 x 3
#>    var1         var2  var3           
#>    <chr>        <chr> <chr>          
#>  1 StudyName    age   EnvironmentName
#>  2 BGB-3111-217 56    UAT            
#>  3 BGB-3111-217 65    .              
#>  4 BGB-3111-217 45    UAT            
#>  5 BGB-3111-217 65    UAT            
#>  6 BGB-3111-217 21    UAT            
#>  7 <NA>         <NA>  <NA>           
#>  8 BGB-3111-217 54    UAT            
#>  9 BGB-3111-217 78    UAT            
#> 10 BGB-3111-217 89    UAT            
#> 11 BGB-3111-217 36    UAT            
#> # … with 1 more row

col_types: Define var types–NULL, a cols() specification, or a string, “GUESS” by default.

c = character	i = integer	n = number
d = double	l = logical	f = factor
D = date	T = date time	t = time
? = guess	_ or - = skip

read_csv("./data/text.csv", col_types = cols(age = col_character()))
#> # A tibble: 11 x 5
#>    StudyName    age   EnvironmentName SiteGroupName SiteNumber
#>    <chr>        <chr> <chr>           <chr>         <chr>     
#>  1 BGB-3111-217 56    UAT             "World"       UATU0001  
#>  2 BGB-3111-217 65    .               "World"       <NA>      
#>  3 BGB-3111-217 45    UAT             "\"\""        UATU0001  
#>  4 BGB-3111-217 65    UAT              <NA>         UATU0001  
#>  5 BGB-3111-217 21    UAT             "World"       UATU0001  
#>  6 <NA>         <NA>  <NA>             <NA>         <NA>      
#>  7 BGB-3111-217 54    UAT             "World"       UATU#0001 
#>  8 BGB-3111-217 78    UAT             "World"       UATU0001  
#>  9 BGB-3111-217 89    UAT             "World"       UATU0001  
#> 10 BGB-3111-217 36    UAT             "World"       #UATU0001 
#> 11 BGB-3111-217 58    UAT             "World"       UATU0001

read_csv("./data/text.csv", col_types = cols_only(StudyName = col_character(), SiteNumber = col_character()))
#> # A tibble: 11 x 2
#>    StudyName    SiteNumber
#>    <chr>        <chr>     
#>  1 BGB-3111-217 UATU0001  
#>  2 BGB-3111-217 <NA>      
#>  3 BGB-3111-217 UATU0001  
#>  4 BGB-3111-217 UATU0001  
#>  5 BGB-3111-217 UATU0001  
#>  6 <NA>         <NA>      
#>  7 BGB-3111-217 UATU#0001 
#>  8 BGB-3111-217 UATU0001  
#>  9 BGB-3111-217 UATU0001  
#> 10 BGB-3111-217 #UATU0001 
#> 11 BGB-3111-217 UATU0001

read_csv("./data/text.csv", col_types = cols(SiteNumber = col_skip(), StudyName = col_skip()))
#> # A tibble: 11 x 3
#>      age EnvironmentName SiteGroupName
#>    <dbl> <chr>           <chr>        
#>  1    56 UAT             "World"      
#>  2    65 .               "World"      
#>  3    45 UAT             "\"\""       
#>  4    65 UAT              <NA>        
#>  5    21 UAT             "World"      
#>  6    NA <NA>             <NA>        
#>  7    54 UAT             "World"      
#>  8    78 UAT             "World"      
#>  9    89 UAT             "World"      
#> 10    36 UAT             "World"      
#> 11    58 UAT             "World"

read_csv("./data/text.csv", col_types = c("cn_cc"))
#> # A tibble: 11 x 4
#>    StudyName      age SiteGroupName SiteNumber
#>    <chr>        <dbl> <chr>         <chr>     
#>  1 BGB-3111-217    56 "World"       UATU0001  
#>  2 BGB-3111-217    65 "World"       <NA>      
#>  3 BGB-3111-217    45 "\"\""        UATU0001  
#>  4 BGB-3111-217    65  <NA>         UATU0001  
#>  5 BGB-3111-217    21 "World"       UATU0001  
#>  6 <NA>            NA  <NA>         <NA>      
#>  7 BGB-3111-217    54 "World"       UATU#0001 
#>  8 BGB-3111-217    78 "World"       UATU0001  
#>  9 BGB-3111-217    89 "World"       UATU0001  
#> 10 BGB-3111-217    36 "World"       #UATU0001 
#> 11 BGB-3111-217    58 "World"       UATU0001

read_csv("./data/text.csv", col_types = list(.default = col_character()))
#> # A tibble: 11 x 5
#>    StudyName    age   EnvironmentName SiteGroupName SiteNumber
#>    <chr>        <chr> <chr>           <chr>         <chr>     
#>  1 BGB-3111-217 56    UAT             "World"       UATU0001  
#>  2 BGB-3111-217 65    .               "World"       <NA>      
#>  3 BGB-3111-217 45    UAT             "\"\""        UATU0001  
#>  4 BGB-3111-217 65    UAT              <NA>         UATU0001  
#>  5 BGB-3111-217 21    UAT             "World"       UATU0001  
#>  6 <NA>         <NA>  <NA>             <NA>         <NA>      
#>  7 BGB-3111-217 54    UAT             "World"       UATU#0001 
#>  8 BGB-3111-217 78    UAT             "World"       UATU0001  
#>  9 BGB-3111-217 89    UAT             "World"       UATU0001  
#> 10 BGB-3111-217 36    UAT             "World"       #UATU0001 
#> 11 BGB-3111-217 58    UAT             "World"       UATU0001

read_csv("./data/text.csv", col_types = cols())
#> # A tibble: 11 x 5
#>    StudyName      age EnvironmentName SiteGroupName SiteNumber
#>    <chr>        <dbl> <chr>           <chr>         <chr>     
#>  1 BGB-3111-217    56 UAT             "World"       UATU0001  
#>  2 BGB-3111-217    65 .               "World"       <NA>      
#>  3 BGB-3111-217    45 UAT             "\"\""        UATU0001  
#>  4 BGB-3111-217    65 UAT              <NA>         UATU0001  
#>  5 BGB-3111-217    21 UAT             "World"       UATU0001  
#>  6 <NA>            NA <NA>             <NA>         <NA>      
#>  7 BGB-3111-217    54 UAT             "World"       UATU#0001 
#>  8 BGB-3111-217    78 UAT             "World"       UATU0001  
#>  9 BGB-3111-217    89 UAT             "World"       UATU0001  
#> 10 BGB-3111-217    36 UAT             "World"       #UATU0001 
#> 11 BGB-3111-217    58 UAT             "World"       UATU0001

na: Character vector of strings to interpret as missing values – c('', 'NA')

read_csv("./data/text.csv", na = c("", "NA", "."))
#> 
#> ── Column specification ────────────────────────────────────────────────────────
#> cols(
#>   StudyName = col_character(),
#>   age = col_double(),
#>   EnvironmentName = col_character(),
#>   SiteGroupName = col_character(),
#>   SiteNumber = col_character()
#> )
#> # A tibble: 11 x 5
#>    StudyName      age EnvironmentName SiteGroupName SiteNumber
#>    <chr>        <dbl> <chr>           <chr>         <chr>     
#>  1 BGB-3111-217    56 UAT             "World"       UATU0001  
#>  2 BGB-3111-217    65 <NA>            "World"       <NA>      
#>  3 BGB-3111-217    45 UAT             "\"\""        UATU0001  
#>  4 BGB-3111-217    65 UAT              <NA>         UATU0001  
#>  5 BGB-3111-217    21 UAT             "World"       UATU0001  
#>  6 <NA>            NA <NA>             <NA>         <NA>      
#>  7 BGB-3111-217    54 UAT             "World"       UATU#0001 
#>  8 BGB-3111-217    78 UAT             "World"       UATU0001  
#>  9 BGB-3111-217    89 UAT             "World"       UATU0001  
#> 10 BGB-3111-217    36 UAT             "World"       #UATU0001 
#> 11 BGB-3111-217    58 UAT             "World"       UATU0001

comment: A string used to identify comments. Any text after the comment characters will be silently ignored.

read_csv("./data/text.csv", comment = "#")
#> 
#> ── Column specification ────────────────────────────────────────────────────────
#> cols(
#>   StudyName = col_character(),
#>   age = col_double(),
#>   EnvironmentName = col_character(),
#>   SiteGroupName = col_character(),
#>   SiteNumber = col_character()
#> )
#> Warning: 1 parsing failure.
#> row col  expected    actual              file
#>  10  -- 5 columns 4 columns './data/text.csv'
#> # A tibble: 11 x 5
#>    StudyName      age EnvironmentName SiteGroupName SiteNumber
#>    <chr>        <dbl> <chr>           <chr>         <chr>     
#>  1 BGB-3111-217    56 UAT             "World"       UATU0001  
#>  2 BGB-3111-217    65 .               "World"       <NA>      
#>  3 BGB-3111-217    45 UAT             "\"\""        UATU0001  
#>  4 BGB-3111-217    65 UAT              <NA>         UATU0001  
#>  5 BGB-3111-217    21 UAT             "World"       UATU0001  
#>  6 <NA>            NA <NA>             <NA>         <NA>      
#>  7 BGB-3111-217    54 UAT             "World"       UATU      
#>  8 BGB-3111-217    78 UAT             "World"       UATU0001  
#>  9 BGB-3111-217    89 UAT             "World"       UATU0001  
#> 10 BGB-3111-217    36 UAT             "World"       <NA>      
#> 11 BGB-3111-217    58 UAT             "World"       UATU0001

trim_ws: Should leading and trailing whitespace be trimmed from each field before parsing it?

read_csv("./data/text.csv", trim_ws = FALSE)
#> 
#> ── Column specification ────────────────────────────────────────────────────────
#> cols(
#>   StudyName = col_character(),
#>   age = col_double(),
#>   EnvironmentName = col_character(),
#>   SiteGroupName = col_character(),
#>   SiteNumber = col_character()
#> )
#> # A tibble: 11 x 5
#>    StudyName      age EnvironmentName SiteGroupName  SiteNumber
#>    <chr>        <dbl> <chr>           <chr>          <chr>     
#>  1 BGB-3111-217    56 "UAT"           "World"        UATU0001  
#>  2 BGB-3111-217    65 "."             "World"        <NA>      
#>  3 BGB-3111-217    45 "UAT"           "\"\""         UATU0001  
#>  4 BGB-3111-217    65 "  UAT"          <NA>          UATU0001  
#>  5 BGB-3111-217    21 "UAT"           "  World     " UATU0001  
#>  6 <NA>            NA  <NA>            <NA>          <NA>      
#>  7 BGB-3111-217    54 "UAT"           "World"        UATU#0001 
#>  8 BGB-3111-217    78 "UAT"           "World"        UATU0001  
#>  9 BGB-3111-217    89 "UAT"           "World"        UATU0001  
#> 10 BGB-3111-217    36 "UAT"           "World"        #UATU0001 
#> 11 BGB-3111-217    58 "UAT"           "World"        UATU0001

skip: Number of lines to skip before reading data.

read_csv("./data/text.csv", skip = 2)
#> Warning: Missing column names filled in: 'X5' [5]
#> 
#> ── Column specification ────────────────────────────────────────────────────────
#> cols(
#>   `BGB-3111-217` = col_character(),
#>   `65` = col_double(),
#>   . = col_character(),
#>   World = col_character(),
#>   X5 = col_character()
#> )
#> # A tibble: 9 x 5
#>   `BGB-3111-217`  `65` .     World   X5       
#>   <chr>          <dbl> <chr> <chr>   <chr>    
#> 1 BGB-3111-217      45 UAT   "\"\""  UATU0001 
#> 2 BGB-3111-217      65 UAT    <NA>   UATU0001 
#> 3 BGB-3111-217      21 UAT   "World" UATU0001 
#> 4 <NA>              NA <NA>   <NA>   <NA>     
#> 5 BGB-3111-217      54 UAT   "World" UATU#0001
#> 6 BGB-3111-217      78 UAT   "World" UATU0001 
#> 7 BGB-3111-217      89 UAT   "World" UATU0001 
#> 8 BGB-3111-217      36 UAT   "World" #UATU0001
#> 9 BGB-3111-217      58 UAT   "World" UATU0001

n_max: Maximum number of records to read.

read_csv("./data/text.csv", n_max = 5)
#> 
#> ── Column specification ────────────────────────────────────────────────────────
#> cols(
#>   StudyName = col_character(),
#>   age = col_double(),
#>   EnvironmentName = col_character(),
#>   SiteGroupName = col_character(),
#>   SiteNumber = col_character()
#> )
#> # A tibble: 5 x 5
#>   StudyName      age EnvironmentName SiteGroupName SiteNumber
#>   <chr>        <dbl> <chr>           <chr>         <chr>     
#> 1 BGB-3111-217    56 UAT             "World"       UATU0001  
#> 2 BGB-3111-217    65 .               "World"       <NA>      
#> 3 BGB-3111-217    45 UAT             "\"\""        UATU0001  
#> 4 BGB-3111-217    65 UAT              <NA>         UATU0001  
#> 5 BGB-3111-217    21 UAT             "World"       UATU0001

locale: The locale controls defaults that vary from place to place

locale()
#> <locale>
#> Numbers:  123,456.78
#> Formats:  %AD / %AT
#> Timezone: UTC
#> Encoding: UTF-8
#> <date_names>
#> Days:   Sunday (Sun), Monday (Mon), Tuesday (Tue), Wednesday (Wed), Thursday
#>         (Thu), Friday (Fri), Saturday (Sat)
#> Months: January (Jan), February (Feb), March (Mar), April (Apr), May (May),
#>         June (Jun), July (Jul), August (Aug), September (Sep), October
#>         (Oct), November (Nov), December (Dec)
#> AM/PM:  AM/PM

locale(
  date_names = "en",
  date_format = "%AD",
  time_format = "%AT",
  decimal_mark = ".",
  grouping_mark = ",",
  tz = "UTC",
  encoding = "UTF-8",
  asciify = FALSE
)
#> <locale>
#> Numbers:  123,456.78
#> Formats:  %AD / %AT
#> Timezone: UTC
#> Encoding: UTF-8
#> <date_names>
#> Days:   Sunday (Sun), Monday (Mon), Tuesday (Tue), Wednesday (Wed), Thursday
#>         (Thu), Friday (Fri), Saturday (Sat)
#> Months: January (Jan), February (Feb), March (Mar), April (Apr), May (May),
#>         June (Jun), July (Jul), August (Aug), September (Sep), October
#>         (Oct), November (Nov), December (Dec)
#> AM/PM:  AM/PM

guess_max: Maximum number of records to use for guessing column types.
quoted_na: Should missing values inside quotes be treated as missing values (the default) or strings

Import xls and xlsx files： readxl::read_xlsx

readxl package

library(readxl)

Functions	Usage
read_excel	Read xls and xlsx files
read_xlsx	Read xlsx files
read_xls	Read xls files
excel_sheet	List all sheets in an excel spreadsheet
excel_format	Determine if files are xls or xlsx

`excel_sheet()`, `excel_format()`

loc <-  c("./data/import_test.xlsx")
a <- excel_sheets(loc)
a
#> [1] "Sheet1" "Sheet2"

b <- excel_format(loc)
b
#> [1] "xlsx"

read_xlsx

Arguments	Default	Option
sheet	`1`	`"sheetname"`, `2`
range	full	`"A1:D15"`, "`Sheet!A1:B14"`
		`cell_limits(c(4, 3), c(NA, NA))`
		`cell_rows()`
		`cell_cols()`
col_names	`TRUE`	`TRUE`/`FALSE`(x1-xn)
		character vector of column names
col_types		`cols(var1 = col_character())`
		`cols_only(X1 = col_character())`
		`cols(var1 = col_skip())`
		`c("ic\_")` `c('text', 'guess', 'numeric')`
		`col_types = "text"`
na	`c("", "NA")`	a vector, `c(" ", "NA", ".")`
n_max	`Inf`	integer
skip	`0`	integer
guess_max	`Inf or 1000`	integer
trim_ws	`TRUE`	`TRUE`/`FALSE`
.name_repair	`"unique"`	default or customize

sheet: Sheet to read – "sheetname" or an integer (the position of the sheet).

read_xlsx("./data/import_test.xlsx", sheet = 1)
#> # A tibble: 6 x 8
#>   Subject Sex      Age Height `Wei-ght` Score quote_escape eol        
#>   <chr>   <chr>  <dbl>  <dbl>     <dbl> <dbl> <chr>        <chr>      
#> 1 001     Female    26    165        64    89 "\"aa\""     "aaa\\nbbb"
#> 2 002     Female    28    170        67    78 "\"65\""     "aaa\\bbb" 
#> 3 003     Male      30    181        80    64 "\"aa\""     "aaanbbb"  
#> 4 004     Male      25    160        60    87 "\"65\""     "ccc\\nbbb"
#> 5 005     Female    36    175        60    57 "\"aa\""     "ccc\\bbb" 
#> 6 006     Male      19    163        68    64 "\"65\""     "cccnbbb"

range: A cell range to read from

read_xlsx("./data/import_test.xlsx", range = "A1:D5")
#> # A tibble: 4 x 4
#>   Subject Sex      Age Height
#>   <chr>   <chr>  <dbl>  <dbl>
#> 1 001     Female    26    165
#> 2 002     Female    28    170
#> 3 003     Male      30    181
#> 4 004     Male      25    160

read_xlsx("./data/import_test.xlsx", range = cell_limits(c(4, 3), c(NA, NA)))
#> # A tibble: 3 x 6
#>    `30` `181`  `80`  `64` `"aa"`   aaanbbb    
#>   <dbl> <dbl> <dbl> <dbl> <chr>    <chr>      
#> 1    25   160    60    87 "\"65\"" "ccc\\nbbb"
#> 2    36   175    60    57 "\"aa\"" "ccc\\bbb" 
#> 3    19   163    68    64 "\"65\"" "cccnbbb"

read_xlsx("./data/import_test.xlsx", range = cell_rows(1:3))
#> # A tibble: 2 x 8
#>   Subject Sex      Age Height `Wei-ght` Score quote_escape eol        
#>   <chr>   <chr>  <dbl>  <dbl>     <dbl> <dbl> <chr>        <chr>      
#> 1 001     Female    26    165        64    89 "\"aa\""     "aaa\\nbbb"
#> 2 002     Female    28    170        67    78 "\"65\""     "aaa\\bbb"

read_xlsx("./data/import_test.xlsx", range = cell_cols(1:5))
#> # A tibble: 6 x 5
#>   Subject Sex      Age Height `Wei-ght`
#>   <chr>   <chr>  <dbl>  <dbl>     <dbl>
#> 1 001     Female    26    165        64
#> 2 002     Female    28    170        67
#> 3 003     Male      30    181        80
#> 4 004     Male      25    160        60
#> 5 005     Female    36    175        60
#> 6 006     Male      19    163        68

col_types: Either NULL to guess all from the spreadsheet or a character vector containing one entry per column from these options: “skip”, “guess”, “logical”, “numeric”, “date”, “text” or “list”. If exactly one col_type is specified, it will be recycled.

read_xlsx("./data/import_test.xlsx", col_types = "text")
#> # A tibble: 6 x 8
#>   Subject Sex    Age   Height `Wei-ght` Score quote_escape eol        
#>   <chr>   <chr>  <chr> <chr>  <chr>     <chr> <chr>        <chr>      
#> 1 001     Female 26    165    64        89    "\"aa\""     "aaa\\nbbb"
#> 2 002     Female 28    170    67        78    "\"65\""     "aaa\\bbb" 
#> 3 003     Male   30    181    80        64    "\"aa\""     "aaanbbb"  
#> 4 004     Male   25    160    60        87    "\"65\""     "ccc\\nbbb"
#> 5 005     Female 36    175    60        57    "\"aa\""     "ccc\\bbb" 
#> 6 006     Male   19    163    68        64    "\"65\""     "cccnbbb"

.name_repair: Handling of column names. By default, readxl ensures column names are not empty and are unique.

my_custom_name_repair <- function(nms) {tolower(gsub("[.-]", "_", nms))}
read_xlsx("./data/import_test.xlsx", .name_repair= my_custom_name_repair)
#> # A tibble: 6 x 8
#>   subject sex      age height wei_ght score quote_escape eol        
#>   <chr>   <chr>  <dbl>  <dbl>   <dbl> <dbl> <chr>        <chr>      
#> 1 001     Female    26    165      64    89 "\"aa\""     "aaa\\nbbb"
#> 2 002     Female    28    170      67    78 "\"65\""     "aaa\\bbb" 
#> 3 003     Male      30    181      80    64 "\"aa\""     "aaanbbb"  
#> 4 004     Male      25    160      60    87 "\"65\""     "ccc\\nbbb"
#> 5 005     Female    36    175      60    57 "\"aa\""     "ccc\\bbb" 
#> 6 006     Male      19    163      68    64 "\"65\""     "cccnbbb"

Import SAS files into R: haven::read_sas

library(haven)

Arguments	Default	Option
catalog_file	`NULL`
encoding	`NULL`
catalog_encoding	`NULL`
n_max	`Inf`	integer
skip	`0`	integer
col_select	`NULL`	a vector, `c('subject', 'age')`
.name_repair	`"unique"`	default or customize

col_select: One or more selection expressions

sas <- read_sas("./data/training.sas7bdat", col_select = c("Age", "Score", "quote_escape"))
sas
#> # A tibble: 6 x 3
#>     Age Score quote_escape
#>   <dbl> <dbl> <chr>       
#> 1    26    89 "\"aa\""    
#> 2    28    78 "\"65\""    
#> 3    30    64 "\"aa\""    
#> 4    25    87 "\"65\""    
#> 5    36    57 "\"aa\""    
#> 6    19    64 "\"65\""

Data Output

Write a data frame to a comma-delimited file: `readr::write_csv`

Arguments	Default	Option
file		File or connection
delim		`','`, `';'`, `'/'`
na	`"NA"`	string
append	`FALSE`	`FALSE`/`TRUE`
col_names	`TRUE`	`FALSE`/`TRUE`
quote_escape	`"double"`	`"double"`, `"backslash"`, `"none"`
eol		`"\n"`

append: If FALSE, will overwrite existing file. If TRUE, will append to existing file. In both cases, if the file does not exist a new file is created.

getwd()
if (!dir.exists('./output')) {dir.create('./output')}

write_csv(sas, "./output/output.csv", append = FALSE)
write_csv(sas, "./output/output.csv", append = TRUE)

col_names: If FALSE, column names will not be included at the top of the file. If TRUE, column names will be included. If not specified, col_names will take the opposite value given to append.
```
write_csv(sas, "./output/output.csv", col_names = FALSE)
write_csv(sas, "./output/output.csv", col_names = TRUE)
```
eol: The end of line character to use. Most commonly either "\n" for Unix style newlines, or "\r\n" for Windows style newlines.
```
write_csv(sas, "./output/output.csv", eol = " a\n")
```
quote_escape: The type of escaping to use for quoted values, one of "double", "backslash" or "none".
```
write_csv(sas, "./output/output.csv", quote_escape = "none")
```

Write a data frame to a excel file: `openxlsx::write.xlsx`

Arguments	Default	Option
asTable	`FALSE`
creator	`NULL`	string
sheetName	`NULL`	string
gridLines	`TRUE`	`TRUE`/`FALSE`
tabColour	`NULL`	yellow
zoom	`100`	`10`—`400`
startCol	`1`	vector or numberic
startRow	`1`	vector or numberic
xy	`c(1, 1)`	vector
colNames/col.names	`TRUE`	`TRUE`/`FALSE`
rowNames/row.names	`FALSE`	`TRUE`/`FALSE`
headerStyle	`NULL`	custom by `createstyle()`
borders	`NULL`	`"surrounding"`, `"columns"` or `"rows"` or `NULL`
borderColour	`NULL`	color
borderStyle	`NULL`	custom by `createstyle()`
keepNA	`FALSE`	`TRUE`/`FALSE`
na.string	`NULL`	string
firstActiveRow	`1`	numeric
firstActiveCol	`1`	numeric
firstRow	`FALSE`	`TRUE`/`FALSE`
firstCol	`FALSE`	`TRUE`/`FALSE`
colWidths	`NA`	`"auto"`
overwrite	`TRUE`	`TRUE`/`FALSE`

asTable: whether with border and filter button, FALSE by default
creator: A string specifying the workbook author
sheetName: Name of the worksheet
gridLines: A logical. If FALSE, the worksheet grid lines will be hidden.
tabColour: Colour of the worksheet tab

zoom: A numeric between 10 and 400. Worksheet zoom level as a percentage.

write.xlsx(
  sas,
  "./output/output.xlsx",
  asTable = TRUE,
  creator = "yuhang",
  sheetName = 'name',
  gridLines = TRUE,
  tabColour = 'yellow',
  zoom = 200
)

startCol: A vector specifying the starting column(s) to write df
startRow: A vector specifying the starting row(s) to write df
xy: An alternative to specifying startCol and startRow individually. A vector of the form c(startCol, startRow)
colNames/col.names: If TRUE, column names of x are written.
rowNames/row.names: If TRUE, row names of x are written.
headerStyle: Custom style to apply to column names.
borders: Either "surrounding", "columns" or "rows" or NULL.
borderColour: Colour of cell border, only borders is not NULL

borderStyle: Border line style, customize by createstyle()

write.xlsx(
  sas,
  "./output/output.xlsx",
  startCol = 4, startRow = 3,
  # xy = c(2, 3),
  col.names = FALSE,
  rowNames = TRUE,
  borders = "surrounding", borderColour = "blue"
)

keepNA: If TRUE, NA values are converted to #N/A (or na.string, if not NULL) in Excel, else NA cells will be empty. Defaults to FALSE.
na.string: If not NULL, and if keepNA is TRUE, NA values are converted to this string in Excel. Defaults to NULL.
firstActiveRow: Top row of active region to freeze pane, begin at 1.
firstActiveCol: Furthest left column of active region to freeze pane, begin at 1.
firstRow: If TRUE, freezes the first row (equivalent to firstActiveRow = 2).
firstCol: If TRUE, freezes the first column (equivalent to firstActiveCol = 2).
colWidths: Must be value "auto". Sets all columns containing data to auto width.

overwrite: Overwrite existing file (Defaults to TRUE as with write.table)

write.xlsx(
  sas,
  "./output/output.xlsx",
  firstActiveRow = 3, firstActiveCol = 3,
  # firstRow = TRUE, firstCol = TRUE,
  colWidths= "auto"
)

Open more than one dataframe into different sheets

l <- list(sas, sas, sas)
write.xlsx(
  l,
  "./output/writeList1.xlsx",
  sheetName = c('a', 'b', 'c'),
  colWidths = c(NA, "auto", "auto"),
  startCol = c(1, 2, 3),
  startRow = 2,
  asTable = c(TRUE, TRUE, FALSE),
  withFilter = c(TRUE, FALSE, FALSE)
)

File/folder operations

Folder operations

dir.exists(): judge whether location exist, return TRUE or FALSE
dir.create(): create a path
unlink(): deletes the file(s) or directories

base::dir.exists("./output/test")
dir.create("./output/test/r", recursive = TRUE)
unlink("./output/test/r", recursive = TRUE)
dir.create("./output/test")
base::dir.exists("./output/test")
unlink("./output/test")

File operations

Functions	Usage
file.create	creates files with the given names
file.exists	indicating whether the files exist
file.remove	remove the files
file.rename	rename files
file.copy	copy files
dir/list.files	list the names of files or directories in the named directory

file.create("./output/test/r/read.txt")
file.exists("./output/test/r/read.txt")
file.rename('./output/test/r/read.txt', './output/test/r/read2.txt')
file.copy('./output/test/r/read2.txt', './output/test/r/read.txt')
dir('./output/test/r')
file.remove('./output/test/r/read2.txt')

Data Import and Write