############################################################################

# Open Online R Stream (https://www.wvbauer.com/doku.php/live_streams)
#
# By:   Wolfgang Viechtbauer (https://www.wvbauer.com)
# Date: 2023-06-29
#
# Topic(s):
# - An Introduction to R
#   https://cran.r-project.org/doc/manuals/r-release/R-intro.html
# - Section(s): 7.1 - 7.4
#
# last updated: 2024-02-23

############################################################################

### 7: Reading data from files

############################################################################

## 7.1: The read.table() function

# show the current working directory (folder)
getwd()

# if this is not the folder that holds this script and the houses.txt
# datafile, change the working directory to that folder with setwd(); replace
# the ... below with the correct location
setwd("...")

# in RStudio, one can also click on Session - Set Working Directory - To
# Source File Location (which sets the working directory to the location of
# this script)

# note: the example dataset shown at the beginning of this section of the
# manual has 'row names', but these are quite specific to R and not something
# one would usually find in a dataset exported from, for example, Excel,
# SPSS, or some other software; therefore, the houses.txt file used here does
# not contain row names

# read in the data from houses.txt (the first line holds the variable names)
dat <- read.table("houses.txt", header = TRUE)

# print the dataset
dat

############################################################################

## 7.2: The scan() function

# read in the same file with scan(); the first line must be skipped, since it
# is the header row with the variable names and not actual data; the list
# given as 'what' specifies the name and type of each column (0 = numeric,
# "" = character)
dat <- scan(
  "houses.txt",
  what = list(Price = 0, Floor = 0, Area = 0, Rooms = 0, Age = 0,
              Cent.heat = ""),
  skip = 1
)
dat

# turn 'dat' (a list at this point) into a data frame
dat <- data.frame(dat)
dat

# it is not clear why one would ever want to use this kind of workflow; using
# read.table() is simpler and more convenient

############################################################################

## 7.3: Accessing builtin datasets

# list the built-in datasets that are currently available
data()

# one of these datasets is called 'mtcars'; it can be accessed directly
mtcars

# for some packages with built-in datasets, the data() function must be used
# to actually load the dataset; this is not really necessary for 'mtcars' and
# is only done here for illustration purposes
data(mtcars)

# now there is a 'mtcars' object in the workspace
ls()

############################################################################

## 7.4: Editing data

# to inspect a dataset, one can use the View() command
View(mtcars)

# edit the 'dat' dataset and assign the changed dataset to 'dat2'
dat2 <- edit(dat)

# never use this! it creates a non-reproducible workflow; if adjustments to
# the dataset are needed, make them with code (which can always be re-run to
# apply the same changes); for illustration purposes, say that the 3rd house
# actually had 7 rooms (and not 5)
dat2 <- dat
dat2$Rooms[3] <- 7
dat2

# also, say that the Age value for the 4th house is actually unknown
dat2$Age[4] <- NA
dat2

# add a new variable to dat2 with some comments about the houses (empty by
# default, filled in for houses 1, 3, and 4)
dat2$Comment <- ""
dat2$Comment[c(1, 3, 4)] <- c("beautiful garden", "no basement", "blah # blup")

############################################################################

## Saving data

# save dat2 as a tab-delimited plain text file called houses_edit.txt,
# without row names and with 'blanks' (not NA) for missing values
write.table(
  dat2,
  file = "houses_edit.txt",
  row.names = FALSE,
  na = "",
  sep = "\t"
)

# the data can now be read back into R with
dat3 <- read.table("houses_edit.txt", header = TRUE, sep = "\t")
dat3

# note: when exporting dat2 to houses_edit.txt, strings are put in quotes;
# other software from which a dataset might be exported may not do this; in
# that case, the # symbol in the dataset will cause problems, because
# read.table() by default treats everything after # as a comment (i.e.,
# everything after # in a line is ignored); this behavior can be switched off
# in read.table() with comment.char=""

# save 'dat2' to an R data file called houses_edit.rdata
save(dat2, file = "houses_edit.rdata")

# remove dat2 from the workspace
rm(dat2)

# load 'dat2' again from houses_edit.rdata
load("houses_edit.rdata")
dat2

# disadvantage of .rdata files: they can only be read by R

# advantages of .rdata files:
# - they are typically smaller in size than .txt files (due to compression)
# - they can be saved and loaded faster (which is only relevant when the
#   dataset is very large)
# - they contain an exact representation of the object(s) that were saved
#   (e.g., factor and time/date variables are saved as such)

############################################################################

## SPSS, Stata, Excel, and other file formats

# the 'R Data Import/Export' manual contains a lot of additional information:
# https://cran.r-project.org/doc/manuals/r-release/R-data.html

# the 'foreign' package has a function called read.spss() for reading in
# SPSS .sav files
library(foreign)
help(read.spss)

# read in the houses_edit.sav file
dat <- read.spss("houses_edit.sav", to.data.frame = TRUE)
dat
str(dat)

# the variable 'Cent.heat', which used value labels in SPSS, is automatically
# turned into a factor, and the variable labels are stored as an attribute of
# the dataset
dat$Cent.heat
attr(dat, "variable.labels")

# the haven package can also read in SPSS files

# install the haven package if it is not already installed
#install.packages("haven")

# load the haven package
library(haven)

# read in the houses_edit.sav file
dat <- read_spss("houses_edit.sav")
dat
str(dat)

# turn the tibble into a regular data frame (and remove all the extra things
# that are attached to the variables, like the variable labels); the steps
# are applied one after the other, starting with the conversion of labelled
# variables into factors
dat <- as_factor(dat)
dat <- zap_widths(dat)
dat <- zap_labels(dat)
dat <- zap_label(dat)
dat <- zap_formats(dat)
dat <- data.frame(dat)
dat

# for Stata, there is read.dta() from the foreign package, but this can only
# read in Stata version 5-12 datasets (even version 12 is quite old); the
# readstata13 and haven packages can also read in more recent versions

# for reading in data from Excel spreadsheets, see:
# https://cran.r-project.org/doc/manuals/r-release/R-data.html#Reading-Excel-spreadsheets

# the readxl package can read in both .xls and .xlsx Excel files and the
# openxlsx package can read in .xlsx files

############################################################################