Attachment 'Benford.py'
Download 1 """
2 Benny Distribution Checker
3
4 AUTHORS:
5
6 - Amy Feaver (2013-07-13)
7
8 - Anne Ho (2013-07-13)
9
10 Michelle Manes, Alina Bucur, Kate Thompson
11
12 """
13
14 import sage.libs.all
15
16 import scipy
17 import numpy
18
19 from scipy import stats
20 from numpy import array
21
22 import matplotlib.pyplot
23
24
25
26
27 def create_Benford_prediction(DataList, base):
28 """
29 INPUT:
30
31 - ``DataList`` -- a list of real numbers
32
33 - ``base`` -- a real number greater than or equal to 2
34
35 OUTPUT:
36
37 A list of length ceil(b), such that for 0 <= i <= ceil(b)-1, the ith position of the list contains the predicted number of items in the list
38 which should start with the digit i under the Benford distribution (when the numbers in DataList are converted to the specified base).
39
40 EXAMPLES:
41
42 sage: Benford_prediction([i for i in range(40)],10)
43 [0, 12.0411998265592, 7.04365036222725, 4.99754946433200, 3.87640052032226, 3.16724984190499, 2.67787158522453, 2.31967787910747, 2.04610089789525, 1.83029962242701]
44
45 sage: Benford_prediction([sqrt(2)*i^2 for i in range(3,81)],5.4402)
46 [0, 31.9193381529151, 18.6716158672934, 13.2477222856217, 10.2757317216334, 8.39588414565996]
47
48 """
49 b = base
50 size = len(DataList)
51 first_digit_Benford = []
52 first_digit_Benford.append(0)
53 c = ceil(b)
54 for d in range(1,c):
55 q =log(1+1.0/d, b).n()
56 first_digit_Benford.append((q*size/log(c,b)).n())
57 return first_digit_Benford
58
59
60
61 def get_first_digit_count(DataList, base):
62 """
63 INPUT:
64
65 - ``DataList`` -- a list of real numbers
66
67 - ``base`` -- a real number greater than or equal to 2
68
69 OUTPUT:
70
71 A list of length ceil(b), such that for 0 <= i <= ceil(b)-1, the ith position of the list contains the number of items in the list which
72 start with the digit i (when the numbers in DataList are converted to the specified base).
73
74 EXAMPLES:
75
76 sage: Benford_count([0,-1,2,3,75],35/4)
77 [1, 1, 1, 1, 0, 0, 0, 0, 1]
78
79 sage: Benford_count([i for i in range(30)],10)
80 [1, 11, 11, 1, 1, 1, 1, 1, 1, 1]
81
82 """
83
84 b = base
85 c = ceil(b)
86
87 #Create first_digit_data; a list of length ceil(b), where each entry is initialized to 0.
88 first_digit_data = []
89 for i in range(c):
90 first_digit_data.append(0)
91
92 for num in DataList:
93 if num == 0:
94 first_digit_data[0] += 1
95 else:
96
97
98
99 #We are trying to avoid loss of precision, so each number num in DataList is viewed as a real number, n.
100 #The value e is the highest power of b in n, and M is the mantissa (i.e. n=M*b^e).
101 n = RR(abs(num))
102
103 l = RR(log(n,b)).n()
104
105 if (l-l.floor())<0.0000000005 or (l.ceil()-l)<0.0000000005:
106 power_of_b=1.0
107 new_e = 0.0
108
109 while power_of_b <= n/b:
110 power_of_b = power_of_b*b
111 new_e += 1.0
112 e = new_e
113 else:
114 e = l.floor()
115
116 M = (n/b**e).n()
117 first_digit= M.floor()
118 first_digit_data[first_digit] += 1
119 return first_digit_data
120
121
122
123 class BenfordChecker:
124
125
126 def __init__(self,data_list,base):
127
128 if len(data_list)==0:
129 raise ValueError, "data list must be nonempty"
130
131 if base<=1:
132 raise ValueError, "base must be greater than 1"
133
134 self.__data_list = data_list
135 self.__base = base
136 self.__Benny = create_Benford_prediction(data_list,base)
137 self.__first_digit_count = get_first_digit_count(data_list,base)
138
139
140 def data_list(self):
141 return self.__data_list
142
143 def base(self):
144 return self.__base
145
146 def Benford_prediction(self):
147 return self.__Benny
148
149 def first_digit_count(self):
150 return self.__first_digit_count
151
152
153 def Benford_chi_square(self):
154 """
155 INPUT:
156
157 - DataList -- a list of real numbers
158
159 - base -- a real number greater than or equal to 2
160
161 OUTPUT:
162
163 A list containing, first, the chi-squared value and, second, the associated p-value.
164
165 EXAMPLES:
166
167 sage: Benford_chi_square([1,2,3,4,5],3)
168 (0.61378226057799101, 0.4333672659230694)
169
170 sage: Benford_chi_square([i for i in range(30)],10)
171 (11.78779211948263, 0.16092617606270504)
172
173 """
174 prediction = list(self.__Benny)
175 count = list(self.__first_digit_count)
176 del prediction[0]
177 del count[0]
178 cs = scipy.stats.chisquare(numpy.array(count),numpy.array(prediction))
179 return cs
180
181
182 def bar_graph(self):
183 N = ceil(self.__base)
184 ind = numpy.arange(N)
185 width = .35
186 p1 = matplotlib.pyplot.bar(ind, self.__Benny, width, color='#669966')
187 p2 = matplotlib.pyplot.bar(ind+.35, self.__first_digit_count, width, color='#66CCCC')
188 matplotlib.pyplot.title('Benford Distribution')
189 matplotlib.pyplot.legend( (p1[0], p2[0]), ('Prediction', 'Count') )
190 matplotlib.pyplot.savefig('Graph.png')
191 matplotlib.pyplot.close()
192
193 def line_graph(self):
194 p1 = matplotlib.pyplot.plot(self.__Benny, color='#669966')
195 p2 = matplotlib.pyplot.plot(self.__first_digit_count, color='#66CCCC')
196 matplotlib.pyplot.title('Benford Distribution')
197 matplotlib.pyplot.legend( (p1[0], p2[0]), ('Prediction', 'Count') )
198 matplotlib.pyplot.savefig('Graph.png')
199 matplotlib.pyplot.close()
200
201
Attached Files
To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily.You are not allowed to attach a file to this page.