Yuan Tian

PhD life | Computer & Health Science | Music and Dance | Views Are My Own

Handbook of Statistical Analyses Using R

Categories: r   

Yuan Tian Posted at — Nov 12, 2019

1 Chapter 1 - Introduction to R

1.1 Import the Dataset

1.1.1 .csv File

# Attach HSAUR3; the data() call below loads Forbes2000 from this package.
library(HSAUR3)
## Loading required package: tools
# Method 1: load the data set bundled with the package.
 data("Forbes2000", package = "HSAUR3")
# Method 2: read a local Forbes2000.csv instead (comma-separated, with a
# header row; the first column is used as row names).
#csvForbes2000 <- read.table("Forbes2000.csv",header = TRUE, sep = ",", row.names = 1)

1.1.2 .xlsx File

# Attach the xlsx package used by the read.xlsx() / read.xlsx2() examples below.
library(xlsx)
# Method 1: read the first sheet with read.xlsx().
#df <- read.xlsx("<name and extension of your file>", sheetIndex = 1)
# Method 2: read the first sheet with read.xlsx2(), here starting at row 2
# and keeping only column 2.
#df <- read.xlsx2("<name and extension of your file>",sheetIndex = 1, startRow=2,colIndex = 2)
# Fun fact: according to the package documentation, read.xlsx2() is about an
# order of magnitude faster than read.xlsx() on sheets with 100,000 cells or more.
# Method 3: query the workbook through an ODBC connection (RODBC package).
#R> library("RODBC")
#R> cnct <- odbcConnectExcel("Forbes2000.xls")
#R> sqlQuery(cnct, "select * from \"Forbes2000\\$\"")

1.1.3 .rda Files

Search for .rda files in the working directory and load one:

#list.files(pattern = "\\.rda")
#load("Forbes2000.rda")

1.2 Export the Dataset

#write.table(Forbes2000, file = "Forbes2000.csv", sep = ",", col.names = NA)
# Or: write.csv(Forbes2000, file = "Forbes2000.csv")
# Or save(Forbes2000, file="Forbes2000.rda")

1.3 Get the Meta-data of the Dataset

Load the Forbes2000 dataset.

# get help - ?function_name (e.g. ?nrow)
# basic meta-data
# Structure of the data frame as a whole: class, dimensions, column names
class(Forbes2000)
## [1] "data.frame"
dim(Forbes2000)
## [1] 2000    8
nrow(Forbes2000)
## [1] 2000
ncol(Forbes2000)
## [1] 8
names(Forbes2000)
## [1] "rank"        "name"        "country"     "category"    "sales"      
## [6] "profits"     "assets"      "marketvalue"
# Single columns, extracted by name with [[ ]]
class(Forbes2000[["rank"]])
## [1] "integer"
length(Forbes2000[["rank"]])
## [1] 2000
Forbes2000[["name"]][1]
## [1] "Citigroup"

1.4 Simple Summary Statistics

Factor and numeric variables:

# Factor column: its class, the number of levels, and the level labels
class(Forbes2000[["category"]])
## [1] "factor"
nlevels(Forbes2000[["category"]])
## [1] 27
levels(Forbes2000[["category"]])
##  [1] "Aerospace & defense"              "Banking"                         
##  [3] "Business services & supplies"     "Capital goods"                   
##  [5] "Chemicals"                        "Conglomerates"                   
##  [7] "Construction"                     "Consumer durables"               
##  [9] "Diversified financials"           "Drugs & biotechnology"           
## [11] "Food drink & tobacco"             "Food markets"                    
## [13] "Health care equipment & services" "Hotels restaurants & leisure"    
## [15] "Household & personal products"    "Insurance"                       
## [17] "Materials"                        "Media"                           
## [19] "Oil & gas operations"             "Retailing"                       
## [21] "Semiconductors"                   "Software & services"             
## [23] "Technology hardware & equipment"  "Telecommunications services"     
## [25] "Trading companies"                "Transportation"                  
## [27] "Utilities"
# Count of rows in each category level
table(Forbes2000[["category"]])
## 
##              Aerospace & defense                          Banking 
##                               19                              313 
##     Business services & supplies                    Capital goods 
##                               70                               53 
##                        Chemicals                    Conglomerates 
##                               50                               31 
##                     Construction                Consumer durables 
##                               79                               74 
##           Diversified financials            Drugs & biotechnology 
##                              158                               45 
##             Food drink & tobacco                     Food markets 
##                               83                               33 
## Health care equipment & services     Hotels restaurants & leisure 
##                               65                               37 
##    Household & personal products                        Insurance 
##                               44                              112 
##                        Materials                            Media 
##                               97                               61 
##             Oil & gas operations                        Retailing 
##                               90                               88 
##                   Semiconductors              Software & services 
##                               26                               31 
##  Technology hardware & equipment      Telecommunications services 
##                               59                               67 
##                Trading companies                   Transportation 
##                               25                               80 
##                        Utilities 
##                              110
# Numeric column: class, median, range, and the summary() quantiles with mean
class(Forbes2000[["sales"]])
## [1] "numeric"
median(Forbes2000[["sales"]])
## [1] 4.365
range(Forbes2000[["sales"]])
## [1]   0.01 256.33
summary(Forbes2000[["sales"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.010   2.018   4.365   9.697   9.547 256.330
comments powered by Disqus