Probability Distributions

Probability Distributions in Pandas

# import pandas and numpy
import pandas as pd
import numpy as np

# NumPy ships samplers for many common probability distributions
# A single trial (n=1) with success probability 0.5: the draw is either 0 or 1
np.random.binomial(n=1, p=0.5)

# Now 1000 trials in one draw: binomial returns the number of successes,
# so dividing by the trial count gives the observed success rate
num_trials = 1000
np.random.binomial(n=num_trials, p=0.5) / num_trials

0.493

# Estimate how often a tornado occurs on two consecutive days, using a binomial distribution
chance_of_tornado = 0.01 # probability of a tornado on any given day

tornado_events = np.random.binomial(1, chance_of_tornado, 1000000) # one 0/1 draw per day, 1000000 simulated days

# Count every adjacent pair of days (j-1, j) on which a tornado occurred both days.
# Comparing the array against itself shifted by one day checks all pairs at once,
# including the final pair, which a loop bound of len(tornado_events)-1 would skip.
two_days_in_a_row = int(np.count_nonzero((tornado_events[1:] == 1) & (tornado_events[:-1] == 1)))

print('{} tornadoes back to back in {} years'.format(two_days_in_a_row, len(tornado_events)/365))

102 tornadoes back to back in 2739.72602739726 years

# A single draw from the uniform distribution between 0 and 1
np.random.uniform(low=0, high=1)

0.2962707472170836

# The same kind of draw, but with the upper bound reduced to 0.75
np.random.uniform(low=0, high=0.75)

0.14042166159994887

Formula for standard deviation $$\sqrt{\frac{1}{N} \sum_{i=1}^N (x_i - \overline{x})^2}$$

# Draw 1000 samples from a normal distribution centred at 0.75 (scale left at its default)
distribution = np.random.normal(loc=0.75, size=1000)

# Standard deviation computed by hand from the formula:
# square root of the mean squared deviation from the sample mean
deviations = distribution - np.mean(distribution)
np.sqrt(np.sum(deviations ** 2) / deviations.size)

0.96359223265251603

# the same standard deviation via numpy's built-in function;
# np.std divides by N by default, so it matches the hand-written formula
np.std(distribution)

0.96359223265251603

# scipy.stats supplies shape statistics for a sample
import scipy.stats as stats
# kurtosis of the sample; scipy's default is the Fisher (excess) definition,
# under which a normal distribution scores 0
stats.kurtosis(distribution)

0.0886272943714217

# skew of the sample: a measure of asymmetry, close to 0 for a symmetric
# distribution such as the normal sample drawn here
stats.skew(distribution)

-0.12279299911663993

# See how the skew of a chi-squared sample changes with the degrees of freedom
# Skew with 2 degrees of freedom
chi_squared_df2 = np.random.chisquare(df=2, size=10000)
stats.skew(chi_squared_df2)

2.110549699727957

# Skew with 5 degrees of freedom
chi_squared_df5 = np.random.chisquare(df=5, size=10000)
stats.skew(chi_squared_df5)

1.1764301229675125

# IPython magic (notebook only): render matplotlib figures inline below the cell
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

# Overlay both chi-squared samples as unfilled step histograms with 50 bins,
# so the difference in skew between 2 and 5 degrees of freedom is visible
output = plt.hist([chi_squared_df2,chi_squared_df5], bins=50, histtype='step', 
                  label=['2 degrees of freedom','5 degrees of freedom'])
# Show the legend using the `label` entries passed to plt.hist
plt.legend(loc='upper right')

graphplot

Credits: Introduction to Data Science in Python