R statistics

R statistics – basic data analysis with automation

To read in Excel files
# Load gdata, which provides read.xls() for reading Excel workbooks.
library(gdata)
# Set the working directory.
# Note the forward slashes in the path, even on Windows.
setwd("C:/Documents and Settings/csnyder/Desktop/My Dropbox/Projects/NEC/Data/")
# Check it (the outer parentheses print the value as well as storing it in WD)
(WD <- getwd())
# Read the workbook into `mdat`, either by full path or relative to the
# working directory set above. Uncomment ONE of these and edit the file name:
# mdat <- read.xls("C:/Documents and Settings/csnyder/Desktop/My Dropbox/Projects/NEC/Data/yourfile.xls")
# mdat <- read.xls("yourfile.xls")
# Attach it if you wish (requires that `mdat` has been created above)
attach(mdat)

Set some identifiers
# mdat <- "my_data_frame" # If not already labeled appropriately
# miv <- "my_independent_variable" # This is the focus of your study
# names(Term) does not work for this: it asks for the names of the elements
# inside the Term column, not for the name of the column itself.
# Store the column name as a string instead.
miv <- "Term"

# Which variables are continuous?
# Build a reduced data frame holding just those columns plus the
# independent variable (Term), which is kept as the first column.
mdat_continV <- mdat[, c(
  "Term", "BWt", "EGA", "AgeatOp", "AgeFirstFed",
  "DaysatDx", "LOS", "Apgar1", "Apgar5"
)]
# Open the result in the data viewer for a visual check
View(mdat_continV)

# Only the complete cases (no NA’s anywhere) – may skip this
# mdat_continV <-mdat_continV[complete.cases(mdat_continV),]

Create some functions – T-test
# Create a ttest function:
library ( plyr )
# Two-sample t-test of one variable, split by Term.
#   x: numeric vector with one value per row of mdat_continV
# Returns the object produced by t.test().
tt <- function(x) {
  # `Term` is looked up in the `data` argument; `x` is the vector passed in.
  t.test(x ~ Term, data = mdat_continV)
}
# A single test is easy – hand one column to the function: tt(mdat$EGA)
# To run it over many columns, use plyr's colwise(), whose general form is
#   colwise(function)(dataframe[c("col1", "col2")])
# The negative index drops the first 3 columns, so only the remaining ones
# are tested; names(mdat_continV)[-(1:3)] lists them.
colwise(tt)(mdat_continV[-(1:3)])
# How to read the result: each entry holds the components of one
# t.test() result, in this order:
# 1 – statistic: t value
# 2 – parameter: degrees of freedom
# 3 – p.value: p value
# 4 – conf.int: 95% confidence interval
# 5 – estimate: sample means compared (grp 0 and grp 1)
# 6 – null.value: the hypothesised difference in means
# 7 – alternative: 2-sided or 1-sided T test
# 8 – method: which kind of t-test was run
# 9 – data.name: dependent variable by independent variable
# NOTE(review): newer R versions also return `stderr` ahead of
# `alternative`, which shifts 7–9 down by one – confirm with names(tt(x)).

# An alternative way to apply t.test to several columns, here selected by
# position (cols 8 and 9 are the EGA and BWt):
sapply(mdat[, c(8, 9)], tt) # returns a matrix
# The same call works when the columns are selected by name:
sapply(mdat[, c("BWt", "EGA")], tt) # returns a matrix

Create some functions – Cross-tab and chi squared

# Cross-tabulation plus chi-squared test of Term against another variable.
#   x: vector with one value per row of the global data frame `mdat`
# Prints the contingency table, then returns its summary().
csq_test <- function(x) {
  crosstab <- xtabs(
    ~ Term + x,
    data = mdat,
    sparse = FALSE,
    exclude = c(NA, NaN),
    drop.unused.levels = FALSE
  )
  print(crosstab)
  summary(crosstab)
}
# Pass the column explicitly instead of relying on attach(mdat): the vector
# then always comes from the same `mdat` that csq_test() tabulates against.
csq_test(mdat$Survived)

Create some graphics – Plot one continuous variable against another

# Define the plotting function
library(plyr)
library(ggplot2)
# General form:
#   my_plot_fxn <- function(.col) ggplot(my_df) +
#     geom_point(aes_string(x = "column_name_of_independ_variable", y = .col))
# plotone() draws a scatter plot of the column named by `.col` (a string)
# against DaysatDx, using the global data frame `mdat`.
plotone <- function(.col) {
  ggplot(mdat) +
    geom_point(aes_string(x = "DaysatDx", y = .col))
}
# Call the function once for each of the column names you want;
# .print = TRUE makes l_ply() print (i.e. draw) every plot it gets back.
l_ply(names(mdat)[c(8:9, 11)], plotone, .print = TRUE)

Create some graphics – Make a series of boxplots

library ( reshape2 )
# Box plots of every non-Term column of the supplied data frame, grouped
# by Term.
#   mdat_continV: data frame containing a `Term` column plus the
#                 measurement column(s) to plot
# Returns the ggplot object.
graphcls <- function(mdat_continV) {
  # Reshape to long format, keeping Term as the identifier column
  mdat_continV.m <- melt(mdat_continV, id.vars = "Term")
  ggplot(mdat_continV.m, aes(x = factor(Term), y = value, colour = variable)) +
    geom_boxplot() +
    ylab("Total Differences")
}
# Call the function with Term (your independent factor variable) and
# whichever measurement column you need.
# Ignore the NAs by taking only the subset without them:
# mmdd <- subset(mdat_continV, subset = (Term < 5 & DaysatDx < 90))
graphcls(mdat_continV[c("Term", "EGA")])
graphcls(mdat_continV[c("Term", "LOS")])
graphcls(mdat_continV[c("Term", "DaysatDx")])

Create some graphics – Make a series of boxplots (alternative)

library ( ggplot2 )
# This plot will work, but the NA's are unsightly
# qplot(factor(Term), EGA, data = mdat, geom = "boxplot")
# Eliminate the NA's
# qplot(factor(Term), EGA, data = subset(mdat, !is.na(EGA)), geom = "boxplot")
# Better: finer control; colour the outliers and label the axis
# NOTE(review): rows are filtered on EGA but the plotted variable is BWt –
# confirm which column is intended.
p <- ggplot(subset(mdat, !is.na(EGA)), aes(factor(Term), BWt))
p + geom_boxplot(outlier.colour = "red", outlier.size = 3) + xlab("Term")
# It is possible to create a plot inside a function, and simply call the
# function with the appropriate variable. Here "Term" is the independent
# binary variable.
# Remove any NA's in the independent variable:
mdat <- subset(mdat, !is.na(Term))
# Box plot of one measurement, grouped by Term.
#   x: vector of values with one entry per row of the global `mdat`
# `Term` is looked up in `mdat`; `x` is the vector passed by the caller.
mplot <- function(x) {
  qplot(factor(Term), x, data = mdat, geom = "boxplot")
}
mplot(mdat$LOS)

Cleaning up imported data

# Did the import leave empty rows at the end of the data?
# Identify them as a range (empty_row_start:empty_row_end) and remove them
# with a negative row index.
mdat <- mdat[-(492:495), ]

# What are the column names?
names(mdat)
# What are the column names, skipping the first 3 columns?
# (`yourdf` is a placeholder – substitute the name of your data frame)
names(yourdf)[-(1:3)]

# What if the column names need to be changed? This changes the name of col 2
colnames(yourdf)[2] <- "newname"

# Count how often each distinct value occurs in one column:
table(mdat$YearDx)
# The same count through the formula interface:
xtabs(~YearDx, data = mdat)

# A more extensive cross-tabulation, over three variables at once:
xtabs(~ YearDx + EGA + Survived, data = mdat)

# How many NA values are there in each column?
library ( plyr )
# Count the NA entries in a vector.
#   x: any vector
# Returns a single integer.
nmissing <- function(x) {
  length(which(is.na(x)))
}
# colwise() already returns a one-row data frame holding the NA count of
# every column; wrapping that in table() would cross-tabulate the counts
# against each other instead of listing them per column.
colwise(nmissing)(mdat)
# How do we deal with the NA values?
# Keep only the rows that are complete in the columns of interest:
#   newdf <- olddf[complete.cases(olddf[, c("col_name1", "col_name2")]), ]
# For NAs in a single column:
#   newdf <- olddf[complete.cases(olddf[, "col_name1"]), ]
mdat1 <- mdat[complete.cases(mdat[c("Term", "BWt", "EGA")]), ]

# Bad data – gender was entered as "F" where a 0 was expected; recode it.
mdat$Sex[mdat$Sex %in% "F"] <- 0

# Drop the rows whose Term is NA:
mdat <- mdat[!is.na(mdat$Term), , drop = FALSE]

Reviewing summary variables

# Mean of every column, computed separately for each level of a binary
# grouping variable (here Term; M/F would work the same way)
ddply(mdat, "Term", colwise(mean))
# To test several numerical (continuous) columns according to whether or
# not the baby is Term (0,1), apply the t-test function to many columns at
# once with plyr's colwise():
colwise(tt)(mdat[, c("BWt", "EGA")])
# This is the best way to do it
sapply(mdat[, c(8, 9)], summary) # returns a matrix

# What was the mean birthweight for term vs non term babies?
# (the function receives the rows of one Term group at a time)
ddply(mdat, "Term", function(grp) mean(grp$BWt, na.rm = TRUE))
# What were the number of survivors for term vs non term babies?
ddply(mdat, "Term", function(grp) sum(grp$Survived, na.rm = TRUE))

library ( reshape2 )
# Reshape to long format. This makes 3 columns: the identifier (Term), the
# name of the measured variable (always "EGA" here) and the measured value.
# na.rm = TRUE drops the rows whose value is NA.
melt(mdat, "Term", c("EGA"), na.rm = TRUE)

# How many term infants versus non term?
# count() adds a `freq` column; arrange() sorts by it, largest first.
myx <- arrange(count(mdat, "Term"), desc(freq))
myx

# Mean (M) and standard deviation (SD) of EGA, computed separately for
# term and non term infants
aggregate(
  EGA ~ Term,
  data = mdat,
  FUN = function(vals) c(M = mean(vals), SD = sd(vals))
)

General

# What example data is available?
data()

# Eliminate any rows with missing or NA data in the col of choice.
# The row mask must be built from the same data frame it is applied to;
# a mask from a different frame (e.g. mdat1) has a different number of rows
# and would select the wrong ones.
mdat2 <- mdat[complete.cases(mdat[, "BreastFed"]), ]
View(mdat2)

Concatenate multiple word (or other format) files into another format

Concatenate

Here is an easy way to concatenate multiple word (or other format) files into another format – in this case, text.

  1. cd to the directory containing the files

  2. Run the following from the command line:

textutil -cat txt -title "Several Files" -output merged.txt *.doc
  1. This uses textutil to bring together (cat) all *.doc files in the directory (folder) and output them as merged.txt in the same directory

  2. Link to textutil