-
Notifications
You must be signed in to change notification settings - Fork 55
/
Copy pathresampling_statistics.py
120 lines (100 loc) · 4.45 KB
/
resampling_statistics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
"""
Functions for computer-intensive methods in statistics
Copyright 2009 Maurice Ling <[email protected]>
Licence: GNU General Public License version 3
Bactome package is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
def randomization(statfunction, fulldata, testdata, iteration=1000):
"""
Test harness for statistical randomization test, also known as
permutation test or bootstrapping.
Pitman, EJG. 1938. Significance tests which may be applied to samples
from any population. Part III. The analysis of variance test.
Biometrika 29: 322-335.
Manly, Bryan FJ. 2007. Randomization, Bootstrap and Monte Carlo
Methods in Biology (3rd edition). Chapman & Hall/CRC.
Parameters:
statfunction = Python function to calculate the test statistic.
fulldata = list of entire data set (for randomization)
testdata = list of sample data of interest (for testing)
iteration = number of times random samples are drawn.
Default = 1000
Returns:
Proportion where statistic from randomized sample is greater
or equal to the statistic from testdata. This can be taken as
p-value for the test
"""
from copads.Operations import sample_wr
test_statistic = statfunction(testdata)
randomized_statistic = [statfunction(sample_wr(fulldata, len(testdata)))
for i in range(iteration)]
#print randomized_statistic, test_statistic
more = [1 for x in randomized_statistic
if x >= test_statistic]
return sum(more) / float(len(randomized_statistic))
def sample_bootstrap(data, iteration=1000):
"""
Calculates bootstrapped means and standard deviation.
Efron, Bradley. 1979. Bootstrap methods: Another look at the jackknife.
The Annals of Statistics 7:1-26.
Manly, Bryan FJ. 2007. Randomization, Bootstrap and Monte Carlo
Methods in Biology (3rd edition). Chapman & Hall/CRC.
Parameters:
data = list of data to generate bootstrapped samples from.
iterations = number of times bootstrapped samples are drawn.
Default = 1000
Returns:
(bootstrapped means, bootstrapped standard deviation)
"""
from copads.Operations import sample_wr
len_data = len(data)
pool = [sum(sample_wr(data, len_data)) / float(len_data)
for x in range(iteration)]
poolmeans = float(sum(pool)) / float(len(pool))
div = [(x - poolmeans)**2 for x in pool]
stdev = math.sqrt(sum(div)) / (len(pool) - 1)
return (poolmeans, stdev)
def sample_jackknife(data):
"""
Estimates mean and standard deviation from a single sample
using Jackknife method.
Wu, CFJ. 1986. Jackknife, Bootstrap and other resampling methods
in regression analysis. The Annals of Statistics 14:1261–1295.
Parameters:
data = list of data to generate bootstrapped samples from.
Returns:
(estimated means, estimated standard deviation)
"""
len_data = len(data)
pool = [sum(data.pop(i)) / float(len_data -1)
for i in range(len_data)]
poolmeans = float(sum(pool)) / float(len(pool))
div = [(x - poolmeans)**2 for x in pool]
stdev = math.sqrt(sum(div)) / (len(pool) - 1)
return (poolmeans, stdev)
def electrophoresis_mw(marker, samples):
"""
Parameters:
marker = data of the marker lane in the format of
[list of migration distance, list of molecular weights]
samples =
"""
from copads.SampleStatistics import TwoSample
if len(marker[0]) != len(marker[1]):
raise AttributeError, 'Number of molecular weights not equal to the \
number of migration distances'
marker[1] = [math.log10(x) for x in marker[1]]
(gradient, intercept) = TwoSample(marker[0], 'dist',
marker[1], 'mw').linear_regression()
return [[10 ** (gradient * band) + 10 ** intercept
for band in sample]
for sample in samples]