# smoothing.py
"""
Apply smoothing to rate computation.
This module provides aspatial and spatial rate smoothers (excess risk,
empirical Bayes smoothers, kernel, disk, and headbanging estimators)
along with utilities for age standardization.
Author(s):
Myunghwa Hwang [email protected]
David Folch [email protected]
Luc Anselin [email protected]
Serge Rey [email protected]
"""
__author__ = (
"Myunghwa Hwang <[email protected]>, "
"David Folch <[email protected]>, "
"Luc Anselin <[email protected]>, "
"Serge Rey <[email protected]"
)
import warnings
from functools import reduce
import numpy as np
from libpysal.cg import (
KDTree,
LineSegment,
Point,
Ray,
convex_hull,
get_angle_between,
get_bounding_box,
get_point_at_angle_and_dist,
get_points_dist,
get_segment_point_dist,
)
from libpysal.common import requires as _requires
from libpysal.weights.distance import Kernel
from libpysal.weights.spatial_lag import lag_spatial as slag
from libpysal.weights.util import comb, get_points_array
from libpysal.weights.weights import W
from scipy.stats import chi2, gamma, norm, poisson
__all__ = [
"Excess_Risk",
"Empirical_Bayes",
"Spatial_Empirical_Bayes",
"Spatial_Rate",
"Kernel_Smoother",
"Age_Adjusted_Smoother",
"Disk_Smoother",
"Spatial_Median_Rate",
"Spatial_Filtering",
"Headbanging_Triples",
"Headbanging_Median_Rate",
"flatten",
"weighted_median",
"sum_by_n",
"crude_age_standardization",
"direct_age_standardization",
"indirect_age_standardization",
"standardized_mortality_ratio",
"choynowski",
"assuncao_rate",
]
def flatten(l, unique=True): # noqa: E741
"""flatten a list of lists
Parameters
----------
l : list
of lists
unique : boolean
whether or not only unique items are wanted (default=True)
Returns
-------
list
of single items
Examples
--------
Creating a sample list whose elements are lists of integers
>>> l = [[1, 2], [3, 4, ], [5, 6]]
Applying flatten function
>>> flatten(l)
[1, 2, 3, 4, 5, 6]
"""
l = reduce(lambda x, y: x + y, l) # noqa: E741
if not unique:
return list(l)
return list(set(l))
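# Illustrative note (added, not part of the original module): with the default
# unique=True the result passes through ``set`` and therefore carries no
# ordering guarantee; pass unique=False to keep duplicates and preserve the
# original left-to-right order of the concatenated sublists.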
def weighted_median(d, w):
"""A utility function to find a median of d based on w
Parameters
----------
d : array
(n, 1), variable for which median will be found
w : array
(n, 1), variable on which d's median will be decided
Notes
-----
d and w are arranged in the same order
Returns
-------
float
median of d
Examples
--------
Creating an array including five integers.
We will get the median of these integers.
>>> d = np.array([5,4,3,1,2])
Creating another array of weight values for the above integers.
The median of d is determined with these weight values taken
into account.
>>> w = np.array([10, 22, 9, 2, 5])
Applying weighted_median function
>>> weighted_median(d, w)
4
"""
dtype = [("w", f"{w.dtype}"), ("v", f"{d.dtype}")]
d_w = np.array(list(zip(w, d, strict=True)), dtype=dtype)
d_w.sort(order="v")
reordered_w = d_w["w"].cumsum()
cumsum_threshold = reordered_w[-1] * 1.0 / 2
median_inx = (reordered_w >= cumsum_threshold).nonzero()[0][0]
if reordered_w[median_inx] == cumsum_threshold and len(d) - 1 > median_inx:
return np.sort(d)[median_inx : median_inx + 2].mean() # noqa: E203
return np.sort(d)[median_inx]
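# Illustrative sketch (added, not in the original module): with uniform
# weights the exact-threshold branch above reproduces the ordinary
# even-length median, e.g.
#   weighted_median(np.array([10, 20, 30, 40]), np.array([1, 1, 1, 1]))
# reaches half the total weight exactly at the second value and returns
# (20 + 30) / 2 = 25.0.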
def sum_by_n(d, w, n):
"""A utility function to summarize a data array into n values
after weighting the array with another weight array w
Parameters
----------
d : array
(t, 1), numerical values
w : array
(t, 1), numerical values for weighting
n : integer
the number of groups
t = c*n (c is a constant)
Returns
-------
: array
(n, 1), an array with summarized values
Examples
--------
Creating an array including four integers.
We will compute weighted means for every two elements.
>>> d = np.array([10, 9, 20, 30])
Here is another array with the weight values for d's elements.
>>> w = np.array([0.5, 0.1, 0.3, 0.8])
We specify the number of groups for which the weighted mean is computed.
>>> n = 2
Applying sum_by_n function
>>> sum_by_n(d, w, n)
array([ 5.9, 30. ])
"""
t = len(d)
h = t // n # must be floor!
d = d * w
return np.array([sum(d[i : i + h]) for i in range(0, t, h)]) # noqa: E203
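# Illustrative note (added, not in the original module): because ``w`` only
# enters through elementwise multiplication, it may also be a scalar;
# sum_by_n(np.array([10, 9, 20, 30]), 1.0, 2) is then a plain block sum and
# returns array([19., 50.]).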
def crude_age_standardization(e, b, n):
"""A utility function to compute rate through crude age standardization
Parameters
----------
e : array
(n*h, 1), event variable measured for each age group across n spatial units
b : array
(n*h, 1), population at risk variable measured for each age
group across n spatial units
n : integer
the number of spatial units
Notes
-----
e and b are arranged in the same order
Returns
-------
: array
(n, 1), age standardized rate
Examples
--------
Creating an array of an event variable (e.g., the number of cancer patients)
for 2 regions in each of which 4 age groups are available.
The first 4 values are event values for the 4 age groups in region 1,
and the next 4 values are for the 4 age groups in region 2.
>>> e = np.array([30, 25, 25, 15, 33, 21, 30, 20])
Creating another array of a population-at-risk variable (e.g., total population)
for the same two regions.
The values are entered in the same order as those of e.
>>> b = np.array([100, 100, 110, 90, 100, 90, 110, 90])
Specifying the number of regions.
>>> n = 2
Applying crude_age_standardization function to e and b
>>> crude_age_standardization(e, b, n)
array([0.2375 , 0.26666667])
"""
r = e * 1.0 / b
b_by_n = sum_by_n(b, 1.0, n)
age_weight = b * 1.0 / b_by_n.repeat(len(e) // n)
return sum_by_n(r, age_weight, n)
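# Sketch of the computation above (added note): each age-specific rate
# e_ih / b_ih is weighted by the region's own age share b_ih / b_i, so the
# result algebraically collapses to the crude rate e_i / b_i per region;
# in the doctest, region 1 gives 95 / 400 = 0.2375.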
def direct_age_standardization(e, b, s, n, alpha=0.05):
"""A utility function to compute rate through direct age standardization
Parameters
----------
e : array
(n*h, 1), event variable measured for each age group across n spatial units
b : array
(n*h, 1), population at risk variable measured for each
age group across n spatial units
s : array
(n*h, 1), standard population for each age group across n spatial units
n : integer
the number of spatial units
alpha : float
significance level for confidence interval
Notes
-----
e, b, and s are arranged in the same order
Returns
-------
list
a list of n tuples; a tuple has a rate and its lower and upper limits
age standardized rates and confidence intervals
Examples
--------
Creating an array of an event variable (e.g., the number of cancer patients)
for 2 regions in each of which 4 age groups are available.
The first 4 values are event values for the 4 age groups in region 1,
and the next 4 values are for the 4 age groups in region 2.
>>> e = np.array([30, 25, 25, 15, 33, 21, 30, 20])
Creating another array of a population-at-risk variable (e.g., total population)
for the same two regions.
The values are entered in the same order as those of e.
>>> b = np.array([1000, 1000, 1100, 900, 1000, 900, 1100, 900])
For direct age standardization, we also need the data for standard population.
Standard population is a reference population-at-risk (e.g., population
distribution for the U.S.) whose age distribution can be used as a benchmarking
point for comparing age distributions across regions (e.g., population
distribution for Arizona and California). Another array including standard
population is created.
>>> s = np.array([1000, 900, 1000, 900, 1000, 900, 1000, 900])
Specifying the number of regions.
>>> n = 2
Applying direct_age_standardization function to e, b, and s
>>> a, b = [i[0] for i in direct_age_standardization(e, b, s, n)]
>>> round(a, 4)
0.0237
>>> round(b, 4)
0.0267
"""
age_weight = (1.0 / b) * (s * 1.0 / sum_by_n(s, 1.0, n).repeat(len(s) // n))
adjusted_r = sum_by_n(e, age_weight, n)
var_estimate = sum_by_n(e, np.square(age_weight), n)
g_a = np.square(adjusted_r) / var_estimate
g_b = var_estimate / adjusted_r
_b = len(b)
_rb = range(0, _b, _b // n)
k = [age_weight[i : i + _b // n].max() for i in _rb] # noqa: E203
g_a_k = np.square(adjusted_r + k) / (var_estimate + np.square(k))
g_b_k = (var_estimate + np.square(k)) / (adjusted_r + k)
res = []
for i in range(len(adjusted_r)):
if adjusted_r[i] == 0:
upper = 0.5 * chi2.ppf(1 - 0.5 * alpha, 2) # df=2: exact Poisson upper bound for a zero count
lower = 0.0
else:
lower = gamma.ppf(0.5 * alpha, g_a[i], scale=g_b[i])
upper = gamma.ppf(1 - 0.5 * alpha, g_a_k[i], scale=g_b_k[i])
res.append((adjusted_r[i], lower, upper))
return res
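# Added note: the interval above is a gamma approximation; the lower bound
# uses shape adjusted_r**2 / var and scale var / adjusted_r, while the upper
# bound inflates both moments by the largest age weight k, which appears to
# follow the gamma-based method of Fay and Feuer (1997). A usage sketch:
#   rate, lower, upper = direct_age_standardization(e, b, s, n, alpha=0.05)[0]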
def indirect_age_standardization(e, b, s_e, s_b, n, alpha=0.05):
"""A utility function to compute rate through indirect age standardization
Parameters
----------
e : array
(n*h, 1), event variable measured for each age group across n spatial units
b : array
(n*h, 1), population at risk variable measured for each age
group across n spatial units
s_e : array
(n*h, 1), event variable measured for each age group across
n spatial units in a standard population
s_b : array
(n*h, 1), population variable measured for each age group across
n spatial units in a standard population
n : integer
the number of spatial units
alpha : float
significance level for confidence interval
Notes
-----
e, b, s_e, and s_b are arranged in the same order
Returns
-------
list
a list of n tuples; a tuple has a rate and its lower and upper limits
age standardized rate
Examples
--------
Creating an array of an event variable (e.g., the number of cancer patients)
for 2 regions in each of which 4 age groups are available.
The first 4 values are event values for the 4 age groups in region 1,
and the next 4 values are for the 4 age groups in region 2.
>>> e = np.array([30, 25, 25, 15, 33, 21, 30, 20])
Creating another array of a population-at-risk variable (e.g., total population)
for the same two regions.
The values are entered in the same order as those of e.
>>> b = np.array([100, 100, 110, 90, 100, 90, 110, 90])
For indirect age standardization, we also need the data for standard population
and event. Standard population is a reference population-at-risk (e.g.,
population distribution for the U.S.) whose age distribution can be used as a
benchmarking point for comparing age distributions across regions (e.g.,
population distribution for Arizona and California). When the same concept is
applied to the event variable, we call it standard event (e.g., the number of
cancer patients in the U.S.). Two additional arrays including standard population
and event are created.
>>> s_e = np.array([100, 45, 120, 100, 50, 30, 200, 80])
>>> s_b = np.array([1000, 900, 1000, 900, 1000, 900, 1000, 900])
Specifying the number of regions.
>>> n = 2
Applying indirect_age_standardization function to e and b
>>> [i[0] for i in indirect_age_standardization(e, b, s_e, s_b, n)]
[0.23723821989528798, 0.2610803324099723]
"""
smr = standardized_mortality_ratio(e, b, s_e, s_b, n)
s_r_all = sum(s_e * 1.0) / sum(s_b * 1.0)
adjusted_r = s_r_all * smr
e_by_n = sum_by_n(e, 1.0, n)
log_smr = np.log(smr)
log_smr_sd = 1.0 / np.sqrt(e_by_n)
norm_thres = norm.ppf(1 - 0.5 * alpha)
log_smr_lower = log_smr - norm_thres * log_smr_sd
log_smr_upper = log_smr + norm_thres * log_smr_sd
smr_lower = np.exp(log_smr_lower) * s_r_all
smr_upper = np.exp(log_smr_upper) * s_r_all
res = list(zip(adjusted_r, smr_lower, smr_upper, strict=True))
return res
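# Added note: each adjusted rate is the region's SMR rescaled by the overall
# standard rate sum(s_e) / sum(s_b); the confidence bounds come from a normal
# approximation on log(SMR) with standard error 1 / sqrt(observed events),
# exponentiated back and rescaled the same way.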
def standardized_mortality_ratio(e, b, s_e, s_b, n):
"""A utility function to compute standardized mortality ratio (SMR).
Parameters
----------
e : array
(n*h, 1), event variable measured for each age group across n spatial units
b : array
(n*h, 1), population at risk variable measured for each age
group across n spatial units
s_e : array
(n*h, 1), event variable measured for each age group across
n spatial units in a standard population
s_b : array
(n*h, 1), population variable measured for each age group
across n spatial units in a standard population
n : integer
the number of spatial units
Notes
-----
e, b, s_e, and s_b are arranged in the same order
Returns
-------
array
(nx1)
Examples
--------
Creating an array of an event variable (e.g., the number of cancer patients)
for 2 regions in each of which 4 age groups are available.
The first 4 values are event values for the 4 age groups in region 1,
and the next 4 values are for the 4 age groups in region 2.
>>> e = np.array([30, 25, 25, 15, 33, 21, 30, 20])
Creating another array of a population-at-risk variable (e.g., total population)
for the same two regions.
The values are entered in the same order as those of e.
>>> b = np.array([100, 100, 110, 90, 100, 90, 110, 90])
To compute standardized mortality ratio (SMR),
we need two additional arrays for standard population and event.
Creating s_e and s_b for standard event and population, respectively.
>>> s_e = np.array([100, 45, 120, 100, 50, 30, 200, 80])
>>> s_b = np.array([1000, 900, 1000, 900, 1000, 900, 1000, 900])
Specifying the number of regions.
>>> n = 2
Applying standardized_mortality_ratio function to e and b
>>> a, b = standardized_mortality_ratio(e, b, s_e, s_b, n)
>>> round(a, 4)
2.4869
>>> round(b, 4)
2.7368
"""
s_r = s_e * 1.0 / s_b
e_by_n = sum_by_n(e, 1.0, n)
expected = sum_by_n(b, s_r, n)
smr = e_by_n * 1.0 / expected
return smr
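# Added note: expected_i = sum_h b_ih * (s_e_h / s_b_h) is the event count
# region i would record at the standard population's age-specific rates, so
# smr_i = observed_i / expected_i; values above 1 indicate more events than
# the standard schedule predicts.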
def choynowski(e, b, n, threshold=None):
"""Choynowski map probabilities [Choynowski1959]_ .
Parameters
----------
e : array(n*h, 1)
event variable measured for each age group across n spatial units
b : array(n*h, 1)
population at risk variable measured for each age group across n spatial units
n : integer
the number of spatial units
threshold : float
Returns zero for any p-value greater than threshold
Notes
-----
e and b are arranged in the same order
Returns
-------
: array (nx1)
Examples
--------
Creating an array of an event variable (e.g., the number of cancer patients)
for 2 regions in each of which 4 age groups are available.
The first 4 values are event values for the 4 age groups in region 1,
and the next 4 values are for the 4 age groups in region 2.
>>> e = np.array([30, 25, 25, 15, 33, 21, 30, 20])
Creating another array of a population-at-risk variable (e.g., total population)
for the same two regions.
The values are entered in the same order as those of e.
>>> b = np.array([100, 100, 110, 90, 100, 90, 110, 90])
Specifying the number of regions.
>>> n = 2
Applying choynowski function to e and b
>>> a,b = choynowski(e, b, n)
>>> round(a, 3)
0.304
>>> round(b, 3)
0.294
"""
e_by_n = sum_by_n(e, 1.0, n)
b_by_n = sum_by_n(b, 1.0, n)
r_by_n = sum(e_by_n) * 1.0 / sum(b_by_n)
expected = r_by_n * b_by_n
p = []
for index, i in enumerate(e_by_n):
if i <= expected[index]:
p.append(poisson.cdf(i, expected[index]))
else:
p.append(1 - poisson.cdf(i - 1, expected[index]))
if threshold:
p = [i if i < threshold else 0.0 for i in p]
return np.array(p)
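# Added note: the probability reported per region is one-tailed in the
# direction of the deviation, P(X <= observed) when the count falls at or
# below the Poisson expectation and P(X >= observed) otherwise, so small
# values flag unusually extreme regions; ``threshold`` zeroes out the rest.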
def assuncao_rate(e, b):
"""The standardized rates where the mean and stadard deviation used for
the standardization are those of Empirical Bayes rate estimates
The standardized rates resulting from this function are used to compute
Moran's I corrected for rate variables [Assuncao1999]_ .
Parameters
----------
e : array(n, 1)
event variable measured at n spatial units
b : array(n, 1)
population at risk variable measured at n spatial units
Notes
-----
e and b are arranged in the same order
Returns
-------
: array (nx1)
Examples
--------
Creating an array of an event variable (e.g., the number of cancer patients)
for 8 regions.
>>> e = np.array([30, 25, 25, 15, 33, 21, 30, 20])
Creating another array of a population-at-risk variable (e.g., total population)
for the same 8 regions.
The values are entered in the same order as those of e.
>>> b = np.array([100, 100, 110, 90, 100, 90, 110, 90])
Computing the rates
>>> assuncao_rate(e, b)[:4]
array([ 1.03843863, -0.04099089, -0.56250375, -1.73061861])
"""
y = e * 1.0 / b
e_sum, b_sum = sum(e), sum(b)
ebi_b = e_sum * 1.0 / b_sum
s2 = sum(b * ((y - ebi_b) ** 2)) / b_sum
ebi_a = s2 - ebi_b / (float(b_sum) / len(e))
ebi_v_raw = ebi_a + ebi_b / b
ebi_v = np.where(ebi_v_raw < 0, ebi_b / b, ebi_v_raw)
return (y - ebi_b) / np.sqrt(ebi_v)
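# Sketch of the formula implemented above (added note): with the overall rate
# beta = sum(e) / sum(b) and the method-of-moments estimate
# alpha = s2 - beta / mean(b), each crude rate r_i = e_i / b_i is standardized
# as z_i = (r_i - beta) / sqrt(alpha + beta / b_i), falling back to
# sqrt(beta / b_i) wherever the estimated variance would be negative.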
class _Smoother:
"""
This is a helper class that implements things that all smoothers should do.
Right now, the only thing that we need to propagate is the by_col function.
TBQH, most of these smoothers should be functions, not classes (aside from
maybe headbanging triples), since they're literally only inits + one
attribute.
"""
def __init__(self):
pass
@classmethod
def by_col(cls, df, e, b, inplace=False, **kwargs):
"""
Compute smoothing by columns in a dataframe.
Parameters
-----------
df : pandas.DataFrame
a dataframe containing the data to be smoothed
e : string or list of strings
the name or names of columns containing event variables to be
smoothed
b : string or list of strings
the name or names of columns containing the population
variables to be smoothed
inplace : bool
a flag denoting whether to output a copy of `df` with the
relevant smoothed columns appended, or to append the columns
directly to `df` itself.
**kwargs: optional keyword arguments
optional keyword options that are passed directly to the
smoother.
Returns
---------
a copy of `df` containing the columns. Or, if `inplace`, this returns
None, but implicitly adds columns to `df`.
"""
msg = (
"The `.by_col()` methods are deprecated and will be "
"removed in a future version of `esda`."
)
warnings.warn(msg, FutureWarning, stacklevel=2)
if not inplace:
new = df.copy()
cls.by_col(new, e, b, inplace=True, **kwargs)
return new
if isinstance(e, str):
e = [e]
if isinstance(b, str):
b = [b]
if len(b) == 1 and len(e) > 1:
b = b * len(e)
try:
assert len(e) == len(b)
except AssertionError:
raise ValueError(
"There is no one-to-one mapping between event "
"variable and population at risk variable!"
) from None
for ei, bi in zip(e, b, strict=True):
ename = ei
bname = bi
ei = df[ename]
bi = df[bname]
outcol = "_".join(("-".join((ename, bname)), cls.__name__.lower()))
df[outcol] = cls(ei, bi, **kwargs).r
class Excess_Risk(_Smoother): # noqa: N801
"""Excess Risk
Parameters
----------
e : array (n, 1)
event variable measured across n spatial units
b : array (n, 1)
population at risk variable measured across n spatial units
Attributes
----------
r : array (n, 1)
excess risk values
Examples
--------
Reading data in stl_hom.csv into stl to extract values
for event and population-at-risk variables
>>> import libpysal
>>> stl = libpysal.io.open(libpysal.examples.get_path('stl_hom.csv'), 'r')
The 11th and 14th columns in stl_hom.csv include the number
of homicides and the population. Creating two arrays from these columns.
>>> stl_e, stl_b = np.array(stl[:,10]), np.array(stl[:,13])
Creating an instance of Excess_Risk class using stl_e and stl_b
>>> er = Excess_Risk(stl_e, stl_b)
Extracting the excess risk values through
the property r of the Excess_Risk instance, er
>>> er.r[:10]
array([[0.20665681],
[0.43613787],
[0.42078261],
[0.22066928],
[0.57981596],
[0.35301709],
[0.56407549],
[0.17020994],
[0.3052372 ],
[0.25821905]])
"""
def __init__(self, e, b):
e = np.asarray(e).reshape(-1, 1)
b = np.asarray(b).reshape(-1, 1)
r_mean = e.sum() * 1.0 / b.sum()
self.r = e * 1.0 / (b * r_mean)
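# Added note: the attribute ``r`` computed above is simply each region's
# crude rate divided by the overall rate, r_i = (e_i / b_i) / (sum(e) /
# sum(b)), so values above 1 mark regions with more events than the global
# rate implies.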
class Empirical_Bayes(_Smoother): # noqa: N801
"""Aspatial Empirical Bayes Smoothing
Parameters
----------
e : array (n, 1)
event variable measured across n spatial units
b : array (n, 1)
population at risk variable measured across n spatial units
Attributes
----------
r : array (n, 1)
rate values from Empirical Bayes Smoothing
Examples
--------
Reading data in stl_hom.csv into stl to extract values
for event and population-at-risk variables
>>> import libpysal
>>> stl_ex = libpysal.examples.load_example('stl')
>>> stl = libpysal.io.open(stl_ex.get_path('stl_hom.csv'), 'r')
The 11th and 14th columns in stl_hom.csv include
the number of homicides and the population. Creating two arrays from these columns.
>>> stl_e, stl_b = np.array(stl[:,10]), np.array(stl[:,13])
Creating an instance of Empirical_Bayes class using stl_e and stl_b
>>> eb = Empirical_Bayes(stl_e, stl_b)
Extracting the risk values through the property
r of the Empirical_Bayes instance, eb
>>> eb.r[:10]
array([[2.36718950e-05],
[4.54539167e-05],
[4.78114019e-05],
[2.76907146e-05],
[6.58989323e-05],
[3.66494122e-05],
[5.79952721e-05],
[2.03064590e-05],
[3.31152999e-05],
[3.02748380e-05]])
"""
def __init__(self, e, b):
e = np.asarray(e).reshape(-1, 1)
b = np.asarray(b).reshape(-1, 1)
e_sum, b_sum = e.sum() * 1.0, b.sum() * 1.0
r_mean = e_sum / b_sum
rate = e * 1.0 / b
r_variat = rate - r_mean
r_var_left = (b * r_variat * r_variat).sum() * 1.0 / b_sum
r_var_right = r_mean * 1.0 / b.mean()
r_var = r_var_left - r_var_right
weight = r_var / (r_var + r_mean / b)
self.r = weight * rate + (1.0 - weight) * r_mean
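# Added note: this is linear shrinkage toward the global rate; the weight
# r_var / (r_var + r_mean / b_i) approaches 1 for populous units (whose crude
# rate is trusted) and 0 for sparse ones (which are pulled to the global
# mean), with both moments estimated from the data as above.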
class _Spatial_Smoother(_Smoother): # noqa: N801
"""
This is a helper class that implements things that all
spatial smoothers should do.
Right now, the only thing that we need to propagate is the by_col function.
TBQH, most of these smoothers should be functions, not classes (aside from
maybe headbanging triples), since they're literally only inits + one
attribute.
"""
def __init__(self):
pass
@classmethod
def by_col(cls, df, e, b, w=None, inplace=False, **kwargs):
"""
Compute smoothing by columns in a dataframe.
Parameters
-----------
df : pandas.DataFrame
a dataframe containing the data to be smoothed
e : string or list of strings
the name or names of columns containing event variables to be
smoothed
b : string or list of strings
the name or names of columns containing the population
variables to be smoothed
w : pysal.weights.W or list of pysal.weights.W
the spatial weights object or objects to use with the
event-population pairs. If not provided and a weights object
is in the dataframe's metadata, that weights object will be
used.
inplace : bool
a flag denoting whether to output a copy of `df` with the
relevant smoothed columns appended, or to append the columns
directly to `df` itself.
**kwargs: optional keyword arguments
optional keyword options that are passed directly to the
smoother.
Returns
---------
a copy of `df` containing the columns. Or, if `inplace`, this returns
None, but implicitly adds columns to `df`.
"""
msg = (
"The `.by_col()` methods are deprecated and will be "
"removed in a future version of `esda`."
)
warnings.warn(msg, FutureWarning, stacklevel=2)
if not inplace:
new = df.copy()
cls.by_col(new, e, b, w=w, inplace=True, **kwargs)
return new
if isinstance(e, str):
e = [e]
if isinstance(b, str):
b = [b]
if w is None:
found = False
for key in df._metadata:
w = df.__dict__.get(key, None)
if isinstance(w, W):
found = True
if not found:
raise ValueError(
"Weights not provided and no weights attached to frame! "
"Please provide a weight or attach a weight to the dataframe."
)
if isinstance(w, W):
w = [w] * len(e)
if len(b) == 1 and len(e) > 1:
b = b * len(e)
try:
assert len(e) == len(b)
except AssertionError:
raise ValueError(
"There is no one-to-one mapping between event "
"variable and population at risk variable!"
) from None
for ei, bi, wi in zip(e, b, w, strict=True):
ename = ei
bname = bi
ei = df[ename]
bi = df[bname]
outcol = "_".join(("-".join((ename, bname)), cls.__name__.lower()))
df[outcol] = cls(ei, bi, w=wi, **kwargs).r
class Spatial_Empirical_Bayes(_Spatial_Smoother): # noqa: N801
"""Spatial Empirical Bayes Smoothing
Parameters
----------
e : array (n, 1)
event variable measured across n spatial units
b : array (n, 1)
population at risk variable measured across n spatial units
w : spatial weights instance
Attributes
----------
r : array (n, 1)
rate values from Empirical Bayes Smoothing
Examples
--------
Reading data in stl_hom.csv into stl to extract values
for event and population-at-risk variables
>>> import libpysal
>>> stl_ex = libpysal.examples.load_example('stl')
>>> stl = libpysal.io.open(stl_ex.get_path('stl_hom.csv'), 'r')
The 11th and 14th columns in stl_hom.csv include the number
of homicides and the population. Creating two arrays from these columns.
>>> stl_e, stl_b = np.array(stl[:,10]), np.array(stl[:,13])
Creating a spatial weights instance by reading in stl.gal file.
>>> stl_w = libpysal.io.open(stl_ex.get_path('stl.gal'), 'r').read()
Ensuring that the elements in the spatial weights instance are ordered
by the given sequential numbers from 1 to the number of observations
in stl_hom.csv
>>> if not stl_w.id_order_set: stl_w.id_order = range(1, len(stl) + 1)
Creating an instance of Spatial_Empirical_Bayes class
using stl_e, stl_b, and stl_w
>>> from esda.smoothing import Spatial_Empirical_Bayes
>>> s_eb = Spatial_Empirical_Bayes(stl_e, stl_b, stl_w)
Extracting the risk values through the property r of s_eb
>>> s_eb.r[:10]
array([[4.01485749e-05],
[3.62437513e-05],
[4.93034844e-05],
[5.09387329e-05],
[3.72735210e-05],
[3.69333797e-05],
[5.40245456e-05],
[2.99806055e-05],
[3.73034109e-05],
[3.47270722e-05]])
"""
def __init__(self, e, b, w):
if not w.id_order_set:
raise ValueError(
"The `id_order` of `w` must be set to "
"align with the order of `e` and `b`."
)
e = np.asarray(e).reshape(-1, 1)
b = np.asarray(b).reshape(-1, 1)
r_mean = Spatial_Rate(e, b, w).r
rate = e * 1.0 / b
r_var_left = np.ones_like(e) * 1.0
ngh_num = np.ones_like(e)
bi = slag(w, b) + b
for i, idv in enumerate(w.id_order):
ngh = list(w[idv].keys()) + [idv]
nghi = [w.id2i[k] for k in ngh]
ngh_num[i] = len(nghi)
v = sum(np.square(rate[nghi] - r_mean[i]) * b[nghi])
r_var_left[i] = v
r_var_left = r_var_left / bi
r_var_right = r_mean / (bi / ngh_num)
r_var = r_var_left - r_var_right
r_var[r_var < 0] = 0.0
self.r = r_mean + (rate - r_mean) * (r_var / (r_var + (r_mean / b)))
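# Added note: the same shrinkage as Empirical_Bayes, but localized; each unit
# is pulled toward the spatially smoothed rate of its neighborhood (itself
# plus its w-neighbors), with the prior moments re-estimated within that
# neighborhood rather than over the whole study area.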
class Spatial_Rate(_Spatial_Smoother): # noqa: N801
"""Spatial Rate Smoothing
Parameters
----------
e : array (n, 1)
event variable measured across n spatial units
b : array (n, 1)
population at risk variable measured across n spatial units