Difference between revisions of "R Programming Language Basics"
Jump to navigation
Jump to search
| Line 7: | Line 7: | ||
<pre> | <pre> | ||
> data=read.table('Statistic.csv') | > data=read.table('Statistic.csv') | ||
| + | > data=read.table('Skeleton.csv', header=TRUE) | ||
| + | > data=read.table('Skeleton.csv', header=T) | ||
# Need two backslashes for Windows file system | # Need two backslashes for Windows file system | ||
> data=read.table('C:\Documents\My R\Life Expectancy.txt') | > data=read.table('C:\Documents\My R\Life Expectancy.txt') | ||
| − | Error: '\ | + | Error: '\D' is an unrecognized escape in character string starting "'C:\D" |
> data=read.table('C:\\Documents\\My R\\Life Expectancy.txt') | > data=read.table('C:\\Documents\\My R\\Life Expectancy.txt') | ||
| + | </pre> | ||
| + | |||
| + | * Assign basic data to a variable | ||
| + | <pre> | ||
| + | > six_grades = c(68, 64, 90, 74, 78, 93) | ||
| + | > sort(six_grades) | ||
| + | [1] 64 68 74 78 90 93 | ||
</pre> | </pre> | ||
| Line 18: | Line 27: | ||
<pre> | <pre> | ||
> data | > data | ||
| − | 1 | + | V1 V2 |
| − | + | 1 Afghanistan 48.673 | |
| − | + | 2 Albania 76.918 | |
| + | 3 Algeria 73.131 | ||
| + | |||
| + | # Show counts in table format using the “table” function | ||
| + | > table(data) | ||
| + | region | ||
| + | Amer EAP EuCA MENA SAs SSA | ||
| + | 39 30 50 21 8 49 | ||
| + | |||
</pre> | </pre> | ||
* Assign a specific column to variable | * Assign a specific column to variable | ||
<pre> | <pre> | ||
| − | > | + | > LifeExp=data[,2] |
| + | </pre> | ||
| + | |||
| + | * Assign all columns to variables | ||
| + | <pre> | ||
| + | > attach(data) | ||
| + | # Now refer to each column by its variable name, which is the column's header name | ||
</pre> | </pre> | ||
* Plotting | * Plotting | ||
<pre> | <pre> | ||
| − | > plot( | + | > plot(LifeExp, xlab='Name', ylab='Score') |
| − | > boxplot( | + | > plot(LifeExp, xlab='Name', ylab='Score', ylim=c(60,80)) |
| + | > boxplot(LifeExp, xlab='Name', ylab='Score', ylim=c(60,80)) | ||
| + | > boxplot(LifeExp, horizontal=TRUE, xlab='Name', ylab='Score', ylim=c(60,80)) | ||
| + | |||
| + | > plot(sort(six_grades), type='b', xlab='Student', ylab='Grade') | ||
| + | |||
</pre> | </pre> | ||
* View basic arithmetic summary data | * View basic arithmetic summary data | ||
<pre> | <pre> | ||
| − | > summary | + | > summary(LifeExp) |
| − | Min. | + | Min. 1st Qu. Median Mean 3rd Qu. Max. |
| + | 47.79 64.67 73.24 69.86 76.65 83.39 | ||
| + | |||
| + | > summary(LifeExp, digits=6) # Show more significant digits | ||
| + | </pre> | ||
| + | |||
| + | * Center of the data | ||
| + | <pre> | ||
| + | > mean(DGDifference) | ||
| + | [1] -14.15 | ||
| + | > mean(DGDifference, trim=0.1) # Trim 10% from the top and bottom | ||
| + | [1] -13.82188 | ||
| + | > median(DGDifference) | ||
| + | [1] -13 | ||
</pre> | </pre> | ||
| + | * Spread of data | ||
| + | <pre> | ||
| + | > range(LifeExp) | ||
| + | [1] 47.794 83.394 | ||
| + | > max(LifeExp)-min(LifeExp) | ||
| + | [1] 35.6 | ||
| + | > IQR(LifeExp) # Inter-Quartile Range (range between 1st and 3rd Quartile) | ||
| + | [1] 11.986 | ||
| + | > var(LifeExp) # Variance | ||
| + | [1] 93.48446 | ||
| + | > sd(LifeExp) # Standard deviation | ||
| + | [1] 9.668736 | ||
| + | > sqrt(var(LifeExp)) # Same Standard deviation | ||
| + | [1] 9.668736 | ||
| + | |||
| + | > round(sd(LifeExp)) | ||
| + | [1] 10 | ||
| + | > round(sd(LifeExp),1) # Keep one decimal place | ||
| + | [1] 9.7 | ||
| + | |||
| + | </pre> | ||
| + | |||
| + | * Variables | ||
| + | <pre> | ||
| + | > orig=c(23,4,66,107,12,45) | ||
| + | > trim=sort(orig) | ||
| + | > trim | ||
| + | [1] 4 12 23 45 66 107 | ||
| + | > trim=trim[2:5] | ||
| + | > trim | ||
| + | [1] 12 23 45 66 | ||
| + | |||
| + | > median = median(trim) | ||
| + | > mean = mean(trim) | ||
| + | > range = max(trim) - min(trim) | ||
| + | > iqr = IQR(trim) | ||
| + | > st.dev = sd(trim) | ||
| + | > all_values = c(median, mean, range, iqr, st.dev) | ||
| + | > combined = cbind(all_values, all_values2) # Combine two sets of data | ||
| + | > rownames(combined) = c('median','mean','range','IQR','Std.Dev') # Rename row labels | ||
| + | |||
| + | </pre> | ||
| + | |||
| + | * Shape of data | ||
| + | <pre> | ||
| + | # Histogram | ||
| + | > hist(LifeExp) | ||
| + | > hist(LifeExp, breaks=5) # 5 buckets | ||
| + | > hist(LifeExp, breaks=5, xlab = 'Life Expectancy (years)', main='Histogram of Life Expectancies') | ||
| + | |||
| + | </pre> | ||
| + | |||
| + | * Categorical (qualitative) variables | ||
| + | <pre> | ||
| + | |||
| + | </pre> | ||
[[Category: R]] | [[Category: R]] | ||
| − | |||
Latest revision as of 09:50, 9 March 2014
- Set working directory
> setwd('~/Desktop/R_scripts')
- Read data
> data=read.table('Statistic.csv')
> data=read.table('Skeleton.csv', header=TRUE)
> data=read.table('Skeleton.csv', header=T)
# Need two backslashes for Windows file system
> data=read.table('C:\Documents\My R\Life Expectancy.txt')
Error: '\D' is an unrecognized escape in character string starting "'C:\D"
> data=read.table('C:\\Documents\\My R\\Life Expectancy.txt')
- Assign basic data to a variable
> six_grades = c(68, 64, 90, 74, 78, 93)
> sort(six_grades)
[1] 64 68 74 78 90 93
- Review data
> data
V1 V2
1 Afghanistan 48.673
2 Albania 76.918
3 Algeria 73.131
# Show counts in table format using the “table” function
> table(data)
region
Amer EAP EuCA MENA SAs SSA
39 30 50 21 8 49
- Assign a specific column to variable
> LifeExp=data[,2]
- Assign all columns to variables
> attach(data)
# Now refer to each column by its variable name, which is the column's header name
- Plotting
> plot(LifeExp, xlab='Name', ylab='Score')
> plot(LifeExp, xlab='Name', ylab='Score', ylim=c(60,80))
> boxplot(LifeExp, xlab='Name', ylab='Score', ylim=c(60,80))
> boxplot(LifeExp, horizontal=TRUE, xlab='Name', ylab='Score', ylim=c(60,80))
> plot(sort(six_grades), type='b', xlab='Student', ylab='Grade')
- View basic arithmetic summary data
> summary(LifeExp)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
  47.79   64.67   73.24   69.86   76.65   83.39
> summary(LifeExp, digits=6) # Show more significant digits
- Center of the data
> mean(DGDifference)
[1] -14.15
> mean(DGDifference, trim=0.1) # Trim 10% from the top and bottom
[1] -13.82188
> median(DGDifference)
[1] -13
- Spread of data
> range(LifeExp)
[1] 47.794 83.394
> max(LifeExp)-min(LifeExp)
[1] 35.6
> IQR(LifeExp) # Inter-Quartile Range (range between 1st and 3rd Quartile)
[1] 11.986
> var(LifeExp) # Variance
[1] 93.48446
> sd(LifeExp) # Standard deviation
[1] 9.668736
> sqrt(var(LifeExp)) # Same Standard deviation
[1] 9.668736
> round(sd(LifeExp))
[1] 10
> round(sd(LifeExp),1) # Keep one decimal place
[1] 9.7
- Variables
> orig=c(23,4,66,107,12,45)
> trim=sort(orig)
> trim
[1] 4 12 23 45 66 107
> trim=trim[2:5]
> trim
[1] 12 23 45 66
> median = median(trim)
> mean = mean(trim)
> range = max(trim) - min(trim)
> iqr = IQR(trim)
> st.dev = sd(trim)
> all_values = c(median, mean, range, iqr, st.dev)
> combined = cbind(all_values, all_values2) # Combine two sets of data
> rownames(combined) = c('median','mean','range','IQR','Std.Dev') # Rename row labels
- Shape of data
# Histogram
> hist(LifeExp)
> hist(LifeExp, breaks=5) # 5 buckets
> hist(LifeExp, breaks=5, xlab = 'Life Expectancy (years)', main='Histogram of Life Expectancies')
- Categorical (qualitative) variables