Difference between revisions of "R Programming Language Basics"
Jump to navigation
Jump to search
Line 7: | Line 7: | ||
<pre> | <pre> | ||
> data=read.table('Statistic.csv') | > data=read.table('Statistic.csv') | ||
+ | > data=read.table('Skeleton.csv', header=TRUE) | ||
+ | > data=read.table('Skeleton.csv', header=T) | ||
# Need two backslashes for Windows file system | # Need two backslashes for Windows file system | ||
> data=read.table('C:\Documents\My R\Life Expectancy.txt') | > data=read.table('C:\Documents\My R\Life Expectancy.txt') | ||
− | Error: '\ | + | Error: '\D' is an unrecognized escape in character string starting "'C:\D" | ||
> data=read.table('C:\\Documents\\My R\\Life Expectancy.txt') | > data=read.table('C:\\Documents\\My R\\Life Expectancy.txt') | ||
+ | </pre> | ||
+ | |||
+ | * Assign basic data to a variable | ||
+ | <pre> | ||
+ | > six_grades = c(68, 64, 90, 74, 78, 93) | ||
+ | > sort(six_grades) | ||
+ | [1] 64 68 74 78 90 93 | ||
</pre> | </pre> | ||
Line 18: | Line 27: | ||
<pre> | <pre> | ||
> data | > data | ||
− | 1 | + | V1 V2 |
− | + | 1 Afghanistan 48.673 | |
− | + | 2 Albania 76.918 | |
+ | 3 Algeria 73.131 | ||
+ | |||
+ | # Show in a table format using the “table” syntax | ||
+ | > table(data) | ||
+ | region | ||
+ | Amer EAP EuCA MENA SAs SSA | ||
+ | 39 30 50 21 8 49 | ||
+ | |||
</pre> | </pre> | ||
* Assign a specific column to variable | * Assign a specific column to variable | ||
<pre> | <pre> | ||
− | > | + | > LifeExp=data[,2] |
+ | </pre> | ||
+ | |||
+ | * Assign all columns to variables | ||
+ | <pre> | ||
+ | > attach(data) | ||
+ | # Now refer to each column directly by its header (column) name | ||
</pre> | </pre> | ||
* Plotting | * Plotting | ||
<pre> | <pre> | ||
− | > plot( | + | > plot(LifeExp, xlab='Name', ylab='Score') |
− | > boxplot( | + | > plot(LifeExp, xlab='Name', ylab='Score', ylim=c(60,80)) |
+ | > boxplot(LifeExp, xlab='Name', ylab='Score', ylim=c(60,80)) | ||
+ | > boxplot(LifeExp, horizontal=TRUE, xlab='Name', ylab='Score', ylim=c(60,80)) | ||
+ | |||
+ | > plot(sort(six_grades), type='b', xlab='Student', ylab='Grade') | ||
+ | |||
</pre> | </pre> | ||
* View basic arithmetic summary data | * View basic arithmetic summary data | ||
<pre> | <pre> | ||
− | > summary | + | > summary(LifeExp) |
− | Min. | + | Min. 1st Qu. Median Mean 3rd Qu. Max. |
+ | 47.79 64.67 73.24 69.86 76.65 83.39 | ||
+ | |||
+ | > summary(LifeExp, digits=6) # Show more significant digits | ||
+ | </pre> | ||
+ | |||
+ | * Center of the data | ||
+ | <pre> | ||
+ | > mean(DGDifference) | ||
+ | [1] -14.15 | ||
+ | > mean(DGDifference, trim=0.1) # Trim 10% from the top and bottom | ||
+ | [1] -13.82188 | ||
+ | > median(DGDifference) | ||
+ | [1] -13 | ||
</pre> | </pre> | ||
+ | * Spread of data | ||
+ | <pre> | ||
+ | > range(LifeExp) | ||
+ | [1] 47.794 83.394 | ||
+ | > max(LifeExp)-min(LifeExp) | ||
+ | [1] 35.6 | ||
+ | > IQR(LifeExp) # Inter-Quartile Range (range between 1st and 3rd Quartile) | ||
+ | [1] 11.986 | ||
+ | > var(LifeExp) # Variance | ||
+ | [1] 93.48446 | ||
+ | > sd(LifeExp) # Standard deviation | ||
+ | [1] 9.668736 | ||
+ | > sqrt(var(LifeExp)) # Same Standard deviation | ||
+ | [1] 9.668736 | ||
+ | |||
+ | > round(sd(LifeExp)) | ||
+ | [1] 10 | ||
+ | > round(sd(LifeExp),1) # Keep one decimal place | ||
+ | [1] 9.7 | ||
+ | |||
+ | </pre> | ||
+ | |||
+ | * Variables | ||
+ | <pre> | ||
+ | > orig=c(23,4,66,107,12,45) | ||
+ | > trim=sort(orig) | ||
+ | > trim | ||
+ | [1] 4 12 23 45 66 107 | ||
+ | > trim=trim[2:5] | ||
+ | > trim | ||
+ | [1] 12 23 45 66 | ||
+ | |||
+ | > median = median(trim) | ||
+ | > mean = mean(trim) | ||
+ | > range = max(trim) - min(trim) | ||
+ | > iqr = IQR(trim) | ||
+ | > st.dev = sd(trim) | ||
+ | > all_values = c(median, mean, range, iqr, st.dev) | ||
+ | > combined = cbind(all_values, all_values2) # Combine two sets of data | ||
+ | > rownames(combined) = c('median','mean','range','IQR','Std.Dev') # Rename row labels | ||
+ | |||
+ | </pre> | ||
+ | |||
+ | * Shape of data | ||
+ | <pre> | ||
+ | # Histogram | ||
+ | > hist(LifeExp) | ||
+ | > hist(LifeExp, breaks=5) # 5 buckets | ||
+ | > hist(LifeExp, breaks=5, xlab = 'Life Expectancy (years)', main='Histogram of Life Expectancies') | ||
+ | |||
+ | </pre> | ||
+ | |||
+ | * Categorical (qualitative) variables | ||
+ | <pre> | ||
+ | |||
+ | </pre> | ||
[[Category: R]] | [[Category: R]] | ||
− |
Latest revision as of 10:50, 9 March 2014
- Set working directory
> setwd('~/Desktop/R_scripts')
- Read data
> data=read.table('Statistic.csv') > data=read.table('Skeleton.csv', header=TRUE) > data=read.table('Skeleton.csv', header=T) # Need two backslashes for Windows file system > data=read.table('C:\Documents\My R\Life Expectancy.txt') Error: '\D' is an unrecognized escape in character string starting "'C:\D" > data=read.table('C:\\Documents\\My R\\Life Expectancy.txt')
- Assign basic data to a variable
> six_grades = c(68, 64, 90, 74, 78, 93) > sort(six_grades) [1] 64 68 74 78 90 93
- Review data
> data V1 V2 1 Afghanistan 48.673 2 Albania 76.918 3 Algeria 73.131 # Show in a table format using the “table” syntax > table(data) region Amer EAP EuCA MENA SAs SSA 39 30 50 21 8 49
- Assign a specific column to variable
> LifeExp=data[,2]
- Assign all columns to variables
> attach(data) # Now refer to each column directly by its header (column) name
- Plotting
> plot(LifeExp, xlab='Name', ylab='Score') > plot(LifeExp, xlab='Name', ylab='Score', ylim=c(60,80)) > boxplot(LifeExp, xlab='Name', ylab='Score', ylim=c(60,80)) > boxplot(LifeExp, horizontal=TRUE, xlab='Name', ylab='Score', ylim=c(60,80)) > plot(sort(six_grades), type='b', xlab='Student', ylab='Grade')
- View basic arithmetic summary data
> summary(LifeExp) Min. 1st Qu. Median Mean 3rd Qu. Max. 47.79 64.67 73.24 69.86 76.65 83.39 > summary(LifeExp, digits=6) # Show more significant digits
- Center of the data
> mean(DGDifference) [1] -14.15 > mean(DGDifference, trim=0.1) # Trim 10% from the top and bottom [1] -13.82188 > median(DGDifference) [1] -13
- Spread of data
> range(LifeExp) [1] 47.794 83.394 > max(LifeExp)-min(LifeExp) [1] 35.6 > IQR(LifeExp) # Inter-Quartile Range (range between 1st and 3rd Quartile) [1] 11.986 > var(LifeExp) # Variance [1] 93.48446 > sd(LifeExp) # Standard deviation [1] 9.668736 > sqrt(var(LifeExp)) # Same Standard deviation [1] 9.668736 > round(sd(LifeExp)) [1] 10 > round(sd(LifeExp),1) # Keep one decimal place [1] 9.7
- Variables
> orig=c(23,4,66,107,12,45) > trim=sort(orig) > trim [1] 4 12 23 45 66 107 > trim=trim[2:5] > trim [1] 12 23 45 66 > median = median(trim) > mean = mean(trim) > range = max(trim) - min(trim) > iqr = IQR(trim) > st.dev = sd(trim) > all_values = c(median, mean, range, iqr, st.dev) > combined = cbind(all_values, all_values2) # Combine two sets of data > rownames(combined) = c('median','mean','range','IQR','Std.Dev') # Rename row labels
- Shape of data
# Histogram > hist(LifeExp) > hist(LifeExp, breaks=5) # 5 buckets > hist(LifeExp, breaks=5, xlab = 'Life Expectancy (years)', main='Histogram of Life Expectancies')
- Categorical (qualitative) variables