# R-pieSales: regression of pie sales on price and advertising expenses.
# All lines starting with # are ignored by R. They are comments that help you
# understand what the R code is doing. Good programmers include lots of comments.
# www.fordham.edu/economics/vinod/ch14ppln.ppt has a powerpoint
rm(list = ls())  # good way to start a session:
# it cleans out stuff left in the memory of R (used by someone else?)
paste("Today is", date())
# y is apple pie sales. c() means catalogue/concatenate. This is the simplest
# way of reading data into R; more advanced ways are in the appendix.
# You should get the prompt > back from R once the data are fully entered.
# If you get a + sign, R wants more input; if something is wrong,
# get out by pressing the escape key.
y <- c(350, 460, 350, 430, 350, 380, 430, 470, 450, 490, 340, 300, 440,
       450, 300)
# price charged
x <- c(5.5, 7.5, 8.0, 8.0, 6.8, 7.5, 4.5, 6.4, 7.0, 5.0, 7.2, 7.9, 5.9,
       5.0, 7.0)
# advertising expenses
z <- c(3.3, 3.3, 3.0, 4.5, 3.0, 4.0, 3.0, 3.7, 3.5, 4.0, 3.5, 3.2, 4.0,
       3.5, 2.7)
# Check data entry: a semicolon is needed if more than one command is on
# the same line. (The original had the word "line" wrapped onto its own
# row without a #, which R then evaluated as the object `line`.)
y; plot(y)  # this lets you know if R understood your typing correctly
hist(y)     # draws histogram of the y data
boxplot(y)  # draws box plot. See if these plots make sense
x; plot(x)  # note that R is case sensitive and wrong upper case will fail
hist(x)     # draws histogram of the x data
boxplot(x)  # draws box plot
z; plot(z)
hist(z)     # draws histogram of the z data
boxplot(z)  # draws box plot
# Advanced check of data entry -- omit at the elementary level.
# Sophisticated box plots for fun.
library(gtools)  # need to download this package and bring it into memory
g <- quantcut(x)  # factor variable built from the four quarters of the data
boxplot(split(x, g), col = "lavender", notch = TRUE)  # 4 notched box plots
# Understanding the data with tests of the null hypothesis mean mu = 0
t.test(x)
sort(x)  # prints the sorted data to the screen
t.test(y)
sort(y)
t.test(z)
sort(z)
# Learn how to download a package from the Internet into R:
# click on the Packages menu and choose "Set CRAN mirror ...";
# choose a location near you, say PA2 Pennsylvania from New York;
# click on "Install packages ..."; a long alphabetic list will appear;
# HIGHLIGHT the name fBasics and click on OK at the bottom.
# Another package called car is needed later for the Durbin-Watson test.
library(fBasics)  # this brings fBasics into the current memory of R
# print all the basic statistics together with one command
basicStats(cbind(y, x, z))  # also works
# y x z
#nobs 15.000000 15.000000 15.000000
#NAs 0.000000 0.000000 0.000000
#Minimum 300.000000 4.500000 2.700000
#Maximum 490.000000 8.000000 4.500000
#1. Quartile 350.000000 5.700000 3.100000
#3. Quartile 450.000000 7.500000 3.850000
#Mean 399.333333 6.613333 3.480000
#Median 430.000000 7.000000 3.500000
#Sum 5990.000000 99.200000 52.200000
#SE Mean 16.401703 0.302508 0.126190
#LCL Mean 364.155178 5.964518 3.209350
#UCL Mean 434.511488 7.262149 3.750650
#Variance 4035.238095 1.372667 0.238857
#Stdev 63.523524 1.171609 0.488730
#Skewness -0.215868 -0.431478 0.373353
#Kurtosis -1.572078 -1.343088 -0.847871
#nobs= number of observations,
#NAs=not available or missing data; usually we want zero missing data!
#1. Quartile means first quartile, 3. Quartile is third quartile
#SE Mean is standard error of mean for testing mu=0,
#LCL Mean is lower 95% confidence limit for mean, UCL Mean=upper limit
# Skewness, Kurtosis are Pearson's measures based on moment orders 3 and 4.
#******
#BEGIN OUTLIER DETECTION FUNCTION
#following function (about 30-lines of code) automatically computes outliers
get.outliers <- function(x) {
  # Compute the outliers of a numeric vector using Tukey's 1.5*IQR fences.
  # (The original text lost the < and > comparison operators during
  # extraction, leaving LO garbled and nLO/UP undefined; reconstructed here.)
  # author H. D. Vinod, Fordham university, New York, 24 March, 2006
  # Returns a list: below/above = outlying values, nLO/nUP = their counts,
  # low.lim/up.lim = the lower and upper fences.
  su <- summary(x)  # su[2] = 1st quartile, su[5] = 3rd quartile
  if (ncol(as.matrix(x)) > 1) {
    print("Error: input to get.outliers function has 2 or more columns")
    return(0)
  }
  iqr <- su[5] - su[2]     # inter-quartile range
  dn <- su[2] - 1.5 * iqr  # lower fence
  up <- su[5] + 1.5 * iqr  # upper fence
  LO <- x[x < dn]          # values below the lower fence
  nLO <- length(LO)
  UP <- x[x > up]          # values above the upper fence
  nUP <- length(UP)
  print(c(" Q1-1.5*(inter quartile range)=", as.vector(dn),
          "number of outliers below it are=", as.vector(nLO)), quote = FALSE)
  if (nLO > 0) {
    print(c("Actual values below the lower limit are:", LO), quote = FALSE)
  }
  print(c(" Q3+1.5*(inter quartile range)=", as.vector(up),
          " number of outliers above it are=", as.vector(nUP)), quote = FALSE)
  if (nUP > 0) {
    print(c("Actual values above the upper limit are:", UP), quote = FALSE)
  }
  list(below = LO, nLO = nLO, above = UP, nUP = nUP, low.lim = dn, up.lim = up)
}
#xx=get.outliers(x)
# END FUNCTION here = = = = = = Must copy all the way to this line
# Now assuming x, y and z are already in memory, use the outlier function:
xx <- get.outliers(x)
# if outliers are present on either side, the x data may need another look!
xx <- get.outliers(y)
xx <- get.outliers(z)
# Tests for correlation coefficients, one pair at a time
cor.test(x, y)
#capture.output(cor.test(x,y), file="c:/stat2/PieSaleOutput.txt",
#append=T)
cor.test(z, y)
#capture.output(cor.test(z,y), file="c:/stat2/PieSaleOutput.txt",
#append=T)
cor.test(x, z)
#capture.output(cor.test(x,z), file="c:/stat2/PieSaleOutput.txt",
#append=T)
# creating a matrix of correlation coefficients
vmtx <- cbind(y, x, z)  # binds the columns of data into one matrix
cor(vmtx)
# this reveals a possible linear relation for each pair of variables only
# Now REGRESSION ANALYSIS, needed if 3 or more variables are involved
reg1 <- lm(y ~ x + z)  # lm = linear model fits the multiple regression
# reg1 is the name of the object containing my first regression results.
# I could have chosen any name I wish. It is used several times below.
#
summary(reg1)  # this is needed to get the detailed regression output
#plot(reg1)
# gives sophisticated plots of residuals;
# you may need to hit "enter" a few times to look at these.
# DO NOT attach these to your term paper. You do not know what
# they mean. They are just for your info and later use.
#capture.output(summary(reg1), file="c:/stat2/PieSaleOutput.txt", append=T)
confint(reg1)  # prints 95% confidence intervals for the coefficients
#capture.output(confint(reg1), file="c:/stat2/PieSaleOutput.txt", append=T)
confint(reg1, level = 0.99)  # prints 99% confidence intervals
# confint does not need the package called car; it is generic to stats
r1 <- resid(reg1)  # creates an object called r1 containing the residuals
sum(r1^2)  # prints the sum of squared residuals
anova(reg1)  # prints the ANOVA table and F values for each regressor
# separately. For the F test needed in the paper use the LAST line of the
# summary(reg1) output from R, e.g.
# F-statistic: 6.539 on 2 and 12 DF, p-value: 0.01201
ano <- anova(reg1)  # ano object has the anova results for manipulation
# FIX: the original ano$S[1] relied on R's partial matching of the column
# name "Sum Sq", which breaks silently if another name starting with "S"
# appears. Use the full name via [[ ]], and sum all regressor rows so the
# code also works with more than two regressors.
ss <- ano[["Sum Sq"]]
regSS <- sum(ss[-length(ss)])  # regression sum of squares
regSS
residSS <- ss[length(ss)]  # residual sum of squares (last ANOVA row)
residSS
totalSS <- regSS + residSS  # INDIRECT estimate of total sum of squares
totalSS
totSS <- var(y) * (length(y) - 1)
totSS  # direct estimate of the total sum of squares; should match exactly
# Finding CRITICAL VALUES from the t table and F table using qt and qf
alp <- 0.05           # the alpha value
alpby2 <- alp / 2     # alpha divided by 2 for the typical two-sided t test
dft <- length(y) - 3  # degrees of freedom for t
qt(alpby2, df = dft)  # quantile of the t distribution
qf(alp, df1 = 2, df2 = dft, lower.tail = FALSE)  # quantile of the F distribution
# Finding p-values from the t table and F table using pt and pf
# (p-values are already given by R)
# for example, the t value in the summary table is -2.306 for pie sales
pt(-2.306, df = dft, lower.tail = TRUE) * 2
# we multiply the tail area by 2 to get the p value = 0.03976325;
# doubling is needed because the t test is two-sided
# for the overall F test, the p value is obtained from the one-sided case
pf(6.539, df1 = 2, df2 = 12, lower.tail = FALSE)  # 1-sided upper-tail test
# here we DO NOT DOUBLE the answer 0.01200411; the F test is one-sided here
# PLOTTING: main= gives the main heading, xlab= is the label for the x axis
plot(x, y, main = "Fig. 1: Demand Curve ", xlab = "Price", ylab = "Sales")
#abline(lm(y~x)) # draws the straight regression line in the scatter-plot
#lines(lowess(x,y)) # suggests the best fitting free-hand nonlinear line
# if the free-hand curve is too different from the straight line,
# we may need a nonlinear model; you may want to say this in the
# discussion of future work.
library(car)  # install and bring the package into the current memory of R
scatterplot(x, y, main = "Fig. 1: Demand Curve ", xlab = "Price",
            ylab = "Sales")
# this will also give box plots for the two variables plus lowess lines
plot(z, y, main = "Fig.2:Z against Y", xlab = "Z", ylab = "Y")
#abline(lm(y~z)) # this draws a regression line in the scatter-plot
#lines(lowess(z,y))
# the free-hand locally weighted scatter-plot smoother suggests the
# presence of nonlinearity, remedied by adding square terms etc.
# Perhaps reg2=lm(y~x+z+I(z^2)) might be better. Note I(z^2) as the
# additional regressor; +z^2 alone will not work, one needs the I(...)
scatterplot(z, y, main = "Fig.2:Z against Y", xlab = "Z", ylab = "Y")
r1 <- resid(reg1)
plot(x, r1, main = "Fig. 3: Residuals against X", xlab = "X",
     ylab = "Residuals")
# ideally this plot should show no particular pattern;
# otherwise a basic assumption of regression is not satisfied by our model
plot(z, r1, main = "Fig. 4: Residuals against Z", xlab = "Z",
     ylab = "Residuals")
# ideally this plot should show no particular pattern
# the car package prints the results of the durbin.watson (DW) test
library(car)
durbin.watson(reg1)
# needed if the residual plots show a serial correlation pattern;
# type help(durbin.watson) for details
fity <- fitted(reg1)
plot(y, fity, main = "Fig. 5: Fitted against Actual y", xlab = "Actual Y",
     ylab = "Fitted Y")
# Good fit = this figure should be close to a virtual 45 degree line.
# The actual angle depends on the scales along the axes; it is exactly
# 45 degrees only if the origin and scale of both axes match.
lines(c(min(y), max(y)), c(min(y), max(y)))  # draws the 45 degree line
lohi <- range(y, fity)  # common axis limits covering both variables
plot(y, fity, main = "Fig. 5: Fitted against Actual y", xlab = "Actual Y",
     ylab = "Fitted Y", xlim = lohi, ylim = lohi)
lines(lohi, lohi)  # draws the 45 degree line
# Get more detailed info about the fit graphically as follows
library(car)
scatterplot(y, fity, main = "Fig. 5: Fitted against Actual y",
            xlab = "Actual\nY",
            ylab = "Fitted Y")  # shows quartiles of y and fity
lines(c(min(y), max(y)), c(min(y), max(y)))  # virtual 45 degree line
# is the dashed line different from the virtual 45 degree solid line?
# Regression with an additional regressor for squared x
reg2 <- lm(y ~ x + z + I(x^2))
anova(reg2)    # see the contribution of adding the squared term
summary(reg2)  # compare adjusted R-squares to decide if adding the squared
# term helps or hurts the model. Sometimes nonlinear terms are
# needed even if they reduce the adjusted R-square.
APPENDIX: METHODS OF READING DATA INTO R
METHOD A: Create and read a text file
In the following example, create a text file with 3 columns. Open a blank MS Word
document. Type in the first row x y z as the column names (so we will say header=TRUE).
The actual data are lined up under the column names.
x y z
1 2 3
4 5 7
8 9 10
7 11 11
6 5 10
Save this MS Word document using "Save As" as a text file "test-r-data.txt" in
some directory. In my case I use the directory "data". It is a good idea to pick the txt
extension for text files. Use Windows Explorer (My Computer --> Local Disk --> data -->
etc.). In the Tools menu click on Folder Options..., then open the View tab and be sure there is no
check mark on "Hide extensions for known file types". Now make sure the file is there
under the right name. Only then use the following R command:
# header=TRUE treats the first line as column names. Spell out TRUE: the
# abbreviation T is an ordinary variable that users can reassign.
myxyz <- read.table("c:\\data\\test-r-data.txt" , header=TRUE)
#Note that we did not bother with changing most defaults.
# note also that Windows paths need the doubled backslash \\ here
summary(myxyz)
METHOD B: Reading 3 columns from an Excel file
If we have data in 3 named columns of an Excel workbook, then use
"Save As..." to create a csv file, which means a comma-separated file.
Then the command is
# sep="," tells read.table the fields are comma-separated (csv).
# Spell out TRUE rather than the reassignable abbreviation T.
myxyz <- read.table("c:\\data\\test-r-data.csv" , header=TRUE, sep=",")
#Note that we have the separator set to be a comma
summary(myxyz)
METHOD C
myxyz <- scan("c:\\data\\test-r-data.txt" , skip=1)
# need skip=1 so the header line with the column names is not read
# Rebuild the data matrix. scan() returns the values in ROW order
# (x1 y1 z1 x2 y2 z2 ...), but matrix() fills COLUMN-wise by default,
# which would scramble the columns; byrow=TRUE fixes that. Also, the
# example file above has 5 data rows, not 6. One must know both the
# row and column dimensions.
myd <- matrix(myxyz, nrow=5, ncol=3, byrow=TRUE)
myd  # this will print the same data without the column headers