### R tutorial for Normal, t and Triangular Distribution ### # Set working directory# setwd("~/Desktop/UCL/IIF/R Workshop/Session 2") # Install required packages# install.packages("EnvStats") install.packages("tigerstats") # Load required packages# library(tigerstats) # Normal Distribution library(EnvStats) # Triangular Distribution ### Normal distribution # For all *norm code (e.g. pnorm, qnorm, rnorm), we need 2 parameters: mean and sd(standard deviation). pnorm(q = 66, mean = 70, sd = 3, lower.tail = T) # pnorm gives the distribution function; # It computes random variable X~Norm(70,9); # lower.tail = T take care of the "less than" part of the requested probability, i.e. :P(X<66); # mean=70 specifies the mean μ of X; # sd=3 specifies the standard deviation σ of X. pnorm(q = 69, mean = 70, sd = 3, lower.tail = F) # For the same random variable X∼Norm(70,9), suppose we want the probability that X is greater than 69, i.e.:P(X>69). pnorm(72, 70, 3) - pnorm(68, 70, 3) # For the same random variable X∼Norm(70,9), suppose we want the probability that X is between 68 and 72, i.e.: P(6873). qnorm(0.85, 70, 3, lower.tail = T) # [1] 73.1093 # qnorm gives the qunantile function, which is the opposite of pnorm; # For example, suppose you want to find that 85th percentile of a normal distribution whose mean is 70 and whose standard deviation is 3; # The value 73.1093 is indeed the 85th percentile, in the sense that 85% of the values in a population that is normally distributed with mean 70 and standard deviation 3 will lie below 73.1093. N1 <- rnorm(n = 1000, mean = 70, sd = 3) # Creat 1000 random numbers follow X~Norm(70,9); hist(N1, freq = F, main = "Normal Distribution\nNorm(70,9)") N1x <- seq (min(N1), max(N1), 0.001) N1y <- dnorm(N1x, mean = mean(N1), sd = sd(N1)) # dnorm computes the probability density of the Norm(mean(N1), sd(N1)) distribution. lines(N1x, N1y) # Plot the distribution: Histogram with normal curve of X~Norm(70,3). # We can do more operations with simple code by using package "tigerstats". pnormGC(bound=66, region="below", mean=70, sd=3) # Same as pnorm(q = 66, mean = 70, sd = 3, lower.tail = T) above; # plot bound=66 tells pnormGC() where to stop; # region="below" takes care of the “less than” part of the requested probability; # mean=70 specifies the mean μ of X; # sd=3 specifies the standard deviation σ of X. pnormGC(bound=66,region="below", mean=70,sd=3,graph=TRUE) # Same as above, except this code creats a graph, by setting the argument graph to TRUE; # It shows the probability distribution of X along with a shaded area that represents the requested probability. pnormGC(bound=69,region="above", mean=70,sd=3,graph=TRUE) # Same as pnorm(q = 69, mean = 70, sd = 3, lower.tail = F) above. pnormGC(bound=c(68,72),region="between",mean=70,sd=3,graph=TRUE) # Same as pnorm(72, 70, 3) - pnorm(68, 70, 3) above, but neater. pnormGC(bound=c(66,73),region="outside",mean=70,sd=3,graph=TRUE) # Same as pnorm(66, 70, 3, lower.tail = T) + pnorm(73, 70, 3, lower.tail = F) above, but neater. qnormGC(0.85,region="below",mean=70,sd=3) # The function qnormGC()aims to do the opposite: given an area, find the boundary value that determines this area; # Same as qnorm(0.85, 70, 3, lower.tail = T) above #tips: Quite often you will want to check your understanding by making a graph of the known percent, with the bounding percentile on the x-axis: # Example # Suppose that class scores are normally distributed, and that the mean class score is 1000, and the standard deviation of all class scores is 100. How high must you score so that only 10% of the population scores higher than you? qnormGC(0.10,region="above",mean=1000,sd=100,graph=TRUE) # An Area Between # Find a positive number z so that the area under the standard normal curve between −z and z is 0.95. qnormGC(0.95,region="between",graph=TRUE) # An Area Outside #Find a positive number z so that the area under the standard normal curve before −z plus the area after z is 0.10. qnormGC(0.10,region="outside",graph=TRUE) # Moment # We have random variable A~Norm(70,9) A <- rnorm(1000, 70, 9) # First Moment: Expected Value mean(A) # Second Moment: Variance var(A) # Third Moment: Skewness skewness(A) # Fourth Moment: Kurtosis kurtosis(A) # Calculate p-value of Normal Distribution Norm_p_val <- function(xbar, mu, var, n){ z <- (xbar - mu)/(sqrt(var/sqrt(n))) p <- 2*pnorm(-abs(z)) return(p) } # xbar = mean of the sample; mu = population mean; var = population variance; n = number of sample. Norm_p_val(xbar = 68, mu = 70,var = 9,n = 150) # Example ### Student's t-distribution # For all *t code (e.g. pt, qt, rt), we need one parameter: df(degrees of freedoms) pt(q = 1.5, df = 15, lower.tail = T) # pt is similiar to pnorm, but it works for distribution function of t-distribution; # Here computes random variable X follows t-distribution with 15 degrees of freedom; # df = 15 specifies the degrees of freedom of X; # lower.tail = T has the same meaning as it appears in the pnorm code above; # There is a different between pnorm and pt; # In pnorm, q = sample mean of X; # In pt, q = t, t = (sample mean of X - μ)/ sample standard deiation/ sqrt(n). # This code computes P(t< 1.5). pt(q = 1.5, df = 15, lower.tail = F) # This code computes P(t>1.5), with the same random variable X as above. pt(1.3, 15) - pt(0.9, 15) # For the same random variable X as above, suppose we want the probability that t is between 1.3 and 0.9, i.e.: P(0.90.8). qt(0.85, 15, lower.tail = T) # [1] 1.073531 # qt gives the qunantile function, which is the opposite of pt; # For example, suppose you want to find that 85th percentile of a t-distribution whose degree of freedom is 15; t1 <- rt(1000, df = 15) # Creat 1000 random numbers follow t-distribution with 15 degrees of freedom; hist(t1, freq = F, main = "t-distribution\ndf = 15") t1x <- seq (min(t1), max(t1), 0.001) t1y <- dt(t1x, 15) # dt computes the probability density of the t-distribution with 15 degress of freedom; lines(t1x, t1y) # Plot the distribution: Histogram with t-distribution curve which degrees of freedom is 15. # Calculate p-value of t-distribution t_p_val <- function(xbar, mu, s, n){ t <- (xbar - mu)/(s/sqrt(n)) p <- 2*pt(-abs(t), df = n-1) return(p) } # xbar = mean of the sample; mu = population mean; var = population variance; n = number of sample. t_p_val(69, 70, 3, 20) # Example ### Triangular Distribution # For all *tri code (e.g. ptri, qtri, rtri), we need three parameters: min(minimum), max(maximum), mode. ptri(q = 15, min = 8, max = 20, mode = 16) # P(X>15), given X~Tri(min=8, max=20, mode=16). 1 - ptri(15, 8, 20, 16) # P(X<15), with the same random variable X as above. # In package "EnvStats", *tri code does not involve setting of "lower.tail"; # Therefore, we calculate P(X>15) by 1 - P(X<15). ptri(18, 8, 20, 16) - ptri(12, 8, 20, 16) # For the same random variable X as above, suppose we want the probability that X is between 18 and 12, i.e.: P(1818). qtri(0.85, 8, 20, 16) # [1] 17.31672 # qtri gives the qunantile function, which is the opposite of ptri; # For example, suppose you want to find that 85th percentile of Tri(8, 20, 16) # The value 17.31672 is indeed the 85th percentile, in the sense that 85% of the values in a population that is triangularly distributed with with min=8, max=20, mode=16 will lie below 17.31672. Tr1 <- rtri(1000, 8, 20, 16) # Creat 1000 random numbers follow Tri(8, 20, 16) hist(Tr1, freq = F, main = "Triangular distribution\nTri(8, 20, 16") Tr1x <- seq (min(Tr1), max(Tr1), 0.001) Mode <- function(x) { ux <- unique(x) ux[which.max(tabulate(match(x, ux)))] } # Mode function # Reference to Ken Williams (https://stackoverflow.com/questions/2547402/is-there-a-built-in-function-for-finding-the-mode) Tr1y <- dtri(Tr1x, min(Tr1), max(Tr1), Mode(Tr1)) # dt computes the probability density of the t-distribution with 15 degress of freedom; lines(Tr1x, Tr1y) # Plot the distribution: Histogram with t-distribution curve which degrees of freedom is 15. #######################################################################################