forked from mathematical-systems/clml
-
Notifications
You must be signed in to change notification settings - Fork 0
/
association-rule.cl
409 lines (385 loc) · 18.1 KB
/
association-rule.cl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
;; association-rule algorithm
(defpackage :association-rule
(:nicknames :assoc)
(:use :cl
:hjs.util.vector
:hjs.learn.read-data)
(:export :association-analyze
:%association-analyze
:%association-analyze-apriori
:%association-analyze-da
:%association-analyze-ap-genrule
:%association-analyze-da-ap-genrule
:%association-analyze-fp-growth
:%association-analyze-eclat
:%association-analyze-lcm
:assoc-result-rules
:assoc-result-header
))
(in-package :association-rule)
(defclass assoc-result-dataset ()
((rules :initarg :rules
:accessor rules
:accessor assoc-result-rules)
(thresholds :initarg :thresholds :accessor thresholds)
(rule-length :initarg :rule-length :accessor rule-length)
(header :allocation :class
:accessor header
:accessor assoc-result-header
:initform
#("premise" "conclusion" "support" "confidence" "lift" "conviction"))))
(defmethod print-object ((d assoc-result-dataset) stream)
(print-unreadable-object (d stream :type t :identity nil))
(format stream "~&THRESHOLDS: ~{~A ~A~^~T| ~}~%"
(loop for index in '("SUPPORT" "CONFIDENCE" "LIFT" "CONVICTION")
for val in (thresholds d)
append `(,index ,val)))
(format stream "~&RULE-LENGTH: ~A~%" (rule-length d))
(format stream "~&RESULT: ~A RULES~%" (length (rules d))))
(defun make-assoc-result (rules support confidence lift conviction rule-length)
(assert (notany #'minusp `(,support ,confidence ,lift ,conviction)))
(assert (> rule-length 1))
(make-instance 'assoc-result-dataset
:rules rules :thresholds `(,support ,confidence ,lift ,conviction)
:rule-length rule-length))
(defmethod assoc-data-out ((d assoc-result-dataset) stream
&optional (control-string "~S"))
(let ((*print-level* nil)
(*print-length* nil))
(let ((header (header d))
(rules-list (rules d)))
(format stream control-string (cons header rules-list))))
d)
;; this iterater use append for save "reversed normal order" made by above.
;; if you optimize this application, please think about ordering sequence.
;; i think, they are equal that <use reverse before pass to this> and <use append here>.
(defun map-separated-two-groups (bag fn &optional passed-1 passed-2)
(if (null bag)
(funcall fn passed-1 passed-2)
(progn (map-separated-two-groups (cdr bag) fn (append passed-1 (list (car bag))) passed-2)
(map-separated-two-groups (cdr bag) fn passed-1 (append passed-2 (list (car bag)))))))
;; atom-rule == (<label-string> . <category-value>)
;; rule == (<atom-rule> {<atom-rule>}*)
(eval-when (:compile-toplevel :load-toplevel :execute)
(proclaim '(inline make-rule finalize-rule)))
(defun finalize-rule (atom-rule) ;; optional -- for visualize
(format nil "~A=~A" (car atom-rule) (cdr atom-rule)))
(defun finalize-rules (rule)
(mapcar #'(lambda (x) (finalize-rule x)) rule))
(defun make-rule (conc pre sup conf lif conv)
(make-array 6 :initial-contents (list (finalize-rules pre)
(finalize-rules conc)
sup conf lif conv)))
(defun rule-indexes (conc pre rule rule-occur total)
(let ((conc-count (gethash conc rule-occur))
(rule-count (gethash rule rule-occur))
(pre-count (gethash pre rule-occur)))
(values (* (/ rule-count total) 100.0) ;; support
(* (/ rule-count pre-count) 100.0) ;; confident
(float (/ (/ rule-count pre-count) (/ conc-count total))) ;; lift
(let ((negative (- pre-count rule-count)))
(if (zerop negative)
most-positive-single-float
(float (/ (* pre-count (- 1 (/ conc-count total))) (- pre-count rule-count))))) ;; conviction
)))
(defun apriori-itemset-counting (transactions item-order support rule-length)
(let* ((itemset-hash (make-hash-table :test #'equal))
(total-transaction (hash-table-count transactions))
(count-threshold (max 1 (* total-transaction (/ support 100)))))
(flet ((prune-itemset-hash (itemsets)
(loop for itemset in itemsets
as num = (gethash itemset itemset-hash)
if (and (numberp num) (>= num count-threshold))
collect itemset
else
do (remhash itemset itemset-hash))))
;; first step, generate 1-itemsets
(loop for trans being the hash-value in transactions
do (loop for item in trans
as itemset = (cons item nil)
do (incf (gethash itemset itemset-hash 0))))
;; frequent itemset generation loop
(loop for k from 2 to rule-length
as itemsets =
(if (= k 2)
(gen-next-itemsets
(sort
(loop for itemset being the hash-key in itemset-hash
for num being the hash-value in itemset-hash
if (>= num count-threshold)
collect itemset
else
do (remhash itemset itemset-hash))
#'< :key #'(lambda (itemset)
(gethash (car itemset) item-order))))
(gen-next-itemsets itemsets))
do
(loop for trans being the hash-value in transactions
do (loop for itemset in itemsets
when (find-in-sorted-list itemset trans)
do (incf (gethash itemset itemset-hash 0))))
(setq itemsets (prune-itemset-hash itemsets))
finally (return itemset-hash)))))
(defun find-in-sorted-list (sorted-items sorted-list &key (test #'equal))
(loop for item in sorted-items
as search-result = (member item sorted-list :test test)
if search-result
do (setq sorted-list (cdr search-result))
else return nil
finally (return t)))
(defun match-except-tail (list1 list2 &key (test #'eql))
(loop for sub1 on list1
for sub2 on list2
while (and (cdr sub1) (cdr sub2))
unless (funcall test (car sub1) (car sub2))
do (return)
finally (return t)))
(defun gen-next-itemsets (pre-itemsets)
(let (next-itemsets)
(do ((itemsets pre-itemsets (cdr itemsets)))
((null (cdr itemsets)) (nreverse next-itemsets))
(loop for itemset in (cdr itemsets)
as last-i = (car (last itemset))
with target = (car itemsets)
with last-t = (car (last target))
when (and (not (equal last-i last-t))
(match-except-tail itemset target :test #'equal))
do (push `(,@target ,last-i) next-itemsets)))))
;; fcn. for obtaining the transactions and the order of items
(defun scan-input-data (unsp-dataset target-variables key-variable)
(let ((transactions (make-hash-table :test #'equal))
(item-order (make-hash-table :test #'equal))
(order 0)
(targets (choice-dimensions
target-variables unsp-dataset))
(keys (choice-a-dimension key-variable unsp-dataset)))
(do-vecs ((target targets)
(k keys))
(loop for x across target
for l in target-variables
for item = (cons l x) do
(unless (gethash item item-order)
(setf (gethash item item-order) (incf order)))
(pushnew item (gethash k transactions nil) :test #'equal)))
(maphash #'(lambda (key items)
(setf (gethash key transactions)
(sort items #'< :key #'(lambda (item)
(gethash item item-order)))))
transactions)
(values transactions item-order)))
(defun %association-analyze (unsp-dataset target-variables key-variable rule-length
&key (support 0) (confident 0) (lift 0) (conviction 0))
(assert (and (<= 0 support 100) (<= 0 confident 100) (<= 0 lift) (<= 0 conviction)))
(assert (and (integerp rule-length) (<= 2 rule-length)))
(multiple-value-bind (transactions item-order)
(scan-input-data unsp-dataset target-variables key-variable)
(let ((rule-occur (apriori-itemset-counting
transactions item-order support rule-length)))
(loop with ans = nil
with count = (hash-table-count transactions)
for rule being the hash-key in rule-occur do
(map-separated-two-groups
rule
#'(lambda (conc pre)
(when (and conc pre)
(multiple-value-bind (sup conf lif conv) (rule-indexes conc pre rule
rule-occur count)
(when (and (>= sup support) (>= conf confident) (>= lif lift) (>= conv conviction))
(push (make-rule conc pre
sup conf lif conv) ans))))))
finally (return (make-assoc-result ans support confident
lift conviction rule-length))))))
;; ap-genrule
;; pass fn such that push rule into some variable to this ap-maprule
;; count-lookup-fn: lookup itemset count
(defun ap-maprule (fn parent-itemset itemset-length count-lookup-fn max-precount
&optional (set-of-itemsets (mapcar #'(lambda (x) (list x)) parent-itemset))
(set-of-itemsets-length 1))
(dolist (conc set-of-itemsets)
(let ((pre (ordered-set-difference parent-itemset conc)))
(if (<= (funcall count-lookup-fn pre) max-precount)
(funcall fn conc pre)
(setf set-of-itemsets (delete conc set-of-itemsets)))))
(when (and (> itemset-length (1 set-of-itemsets-length)) set-of-itemsets)
(ap-maprule fn parent-itemset itemset-length count-lookup-fn max-precount
(gen-next-itemsets set-of-itemsets) (1 set-of-itemsets-length))))
(defun ordered-set-difference (sorted-item1 sorted-item2 &key (test #'equal))
(loop for item in sorted-item1
as search-result = (member item sorted-item2 :test test)
unless search-result
collect item))
;; if confident == 0, then max-count is most-positive-fixnum
(defun confident->max-precount (rule-count confident)
(if (zerop confident)
most-positive-fixnum
(* (/ rule-count confident) 100.0)))
(defun %association-analyze-ap-genrule (unsp-dataset target-variables key-variable rule-length
&key (support 0) (confident 0) (lift 0) (conviction 0))
(assert (and (<= 0 support 100) (<= 0 confident 100) (<= 0 lift) (<= 0 conviction)))
(assert (and (integerp rule-length) (<= 2 rule-length)))
(multiple-value-bind (transactions item-order)
(scan-input-data unsp-dataset target-variables key-variable)
(let ((rule-occur (apriori-itemset-counting
transactions item-order support rule-length)))
(let ((ans nil)
(count (hash-table-count transactions)))
(maphash #'(lambda (rule rule-count)
(let ((rule-length (length rule)))
(when (> rule-length 1)
(ap-maprule
#'(lambda (conc pre)
(multiple-value-bind (sup conf lif conv) (rule-indexes conc pre rule
rule-occur count)
(when (and (>= sup support) (>= conf confident) (>= lif lift) (>= conv conviction))
(push (make-rule conc pre
sup conf lif conv) ans))))
rule (length rule)
#'(lambda (itemset) (gethash itemset rule-occur))
(confident->max-precount rule-count confident)))))
rule-occur)
(make-assoc-result ans support confident
lift conviction rule-length)))))
(defun gen-next-itemset-trie (pre-itemsets)
(let ((next-trie (cons nil nil)))
(do ((itemsets pre-itemsets (cdr itemsets)))
((null (cdr itemsets)) next-trie)
(loop for itemset in (cdr itemsets)
as last-i = (car (last itemset))
with target = (car itemsets)
with last-t = (car (last target))
when (and (not (equal last-i last-t))
(match-except-tail itemset target :test #'equal))
do (assign-trie target last-i next-trie)))))
;; apriori counting trie has leaf at only last-i
(defun assign-trie (target last-i root)
(loop for i in target do
(setf root (let ((found (find
i (cdr root) :key #'car :test #'equal)))
(if found
found
(let ((new (cons i nil)))
(push new (cdr root))
new)))))
;; never found last-i in last-leaf
(push (cons last-i 0) (cdr root)))
(defun update-trie-count-apriori (trie transaction)
(let ((remain (member (car trie) transaction :test #'equal)))
(when remain
(if (consp (cdr trie))
(loop for branch in (cdr trie) do
(update-trie-count-apriori branch remain))
(incf (cdr trie))))))
;; push itemset into some variable by fn
;; reversed twice (1: building counting-trie, 2: accumrating by fn)
;; so order of itemsets is protected as a result.
(defun dump-itemset-hash (trie itemset-hash minimum-count fn
&optional (passed nil))
(if (consp (cdr trie))
(loop for branch in (cdr trie) do
(dump-itemset-hash branch itemset-hash minimum-count fn
(cons (car trie) passed)))
(let ((count (cdr trie)))
(when (>= count minimum-count)
(let ((new-itemset (reverse (cons (car trie) passed))))
(funcall fn new-itemset)
(setf (gethash new-itemset itemset-hash) count))))))
(defun apriori-itemset-counting-trie (transactions item-order support rule-length)
(let* ((itemset-hash (make-hash-table :test #'equal))
(total-transaction (hash-table-count transactions))
(count-threshold (max 1 (* total-transaction (/ support 100)))))
;; first setp, generate 1-itemsets
(loop for trans being the hash-value in transactions
do (loop for item in trans
as itemset = (list item)
do (incf (gethash itemset itemset-hash 0))))
;; frequent itemset generation loop
(let ((itemsets (sort
(loop for itemset being the hash-key in itemset-hash
for num being the hash-value in itemset-hash
if (>= num count-threshold)
collect itemset
else
do (remhash itemset itemset-hash))
#'< :key #'(lambda (itemset)
(gethash (car itemset) item-order)))))
(loop for k from 2 to rule-length
as counting-trie = (gen-next-itemset-trie itemsets)
do
(loop for trans being the hash-value in transactions do
(loop for branch in (cdr counting-trie) do
(update-trie-count-apriori branch trans)))
(setf itemsets nil)
(loop for branch in (cdr counting-trie) do
(dump-itemset-hash branch itemset-hash count-threshold
#'(lambda (itemset)
(push itemset itemsets))))
finally (return itemset-hash)))))
(defun %association-analyze-apriori (unsp-dataset target-variables key-variable rule-length
&key (support 0) (confident 0) (lift 0) (conviction 0))
(assert (and (<= 0 support 100) (<= 0 confident 100) (<= 0 lift) (<= 0 conviction)))
(assert (and (integerp rule-length) (<= 2 rule-length)))
(multiple-value-bind (transactions item-order)
(scan-input-data unsp-dataset target-variables key-variable)
(let ((rule-occur (apriori-itemset-counting-trie
transactions item-order support rule-length)))
(let ((ans nil)
(count (hash-table-count transactions)))
(maphash #'(lambda (rule rule-count)
(let ((rule-length (length rule)))
(when (> rule-length 1)
(ap-maprule
#'(lambda (conc pre)
(multiple-value-bind (sup conf lif conv) (rule-indexes conc pre rule
rule-occur count)
(when (and (>= sup support) (>= conf confident) (>= lif lift) (>= conv conviction))
(push (make-rule conc pre
sup conf lif conv) ans))))
rule (length rule)
#'(lambda (itemset) (gethash itemset rule-occur))
(confident->max-precount rule-count confident)))))
rule-occur)
(make-assoc-result ans support confident
lift conviction rule-length)))))
;; interface
(defun association-analyze (infile outfile target-variables key-variable rule-length
&key (support 0) (confident 0) (lift 0) (conviction 0) (external-format :default)
(file-type :sexp) (csv-type-spec '(string double-float))
(algorithm :lcm))
(assert (member algorithm `(:apriori :da :fp-growth :eclat :lcm)))
(let ((assoc-result
(case algorithm
(:apriori
(%association-analyze-apriori
(read-data-from-file infile :external-format external-format
:type file-type :csv-type-spec csv-type-spec)
target-variables key-variable rule-length
:support support :confident confident :lift lift :conviction conviction))
(:da
(%association-analyze-da-ap-genrule
(read-data-from-file infile :external-format external-format
:type file-type :csv-type-spec csv-type-spec)
target-variables key-variable rule-length
:support support :confident confident :lift lift :conviction conviction))
(:fp-growth
(%association-analyze-fp-growth
(read-data-from-file infile :external-format external-format
:type file-type :csv-type-spec csv-type-spec)
target-variables key-variable rule-length
:support support :confident confident :lift lift :conviction conviction))
(:eclat
(%association-analyze-eclat
(read-data-from-file infile :external-format external-format
:type file-type :csv-type-spec csv-type-spec)
target-variables key-variable rule-length
:support support :confident confident :lift lift :conviction conviction))
(:lcm
(%association-analyze-lcm
(read-data-from-file infile :external-format external-format
:type file-type :csv-type-spec csv-type-spec)
target-variables key-variable rule-length
:support support :confident confident :lift lift :conviction conviction)))))
(with-open-file (stream outfile :direction :output :if-exists :supersede
:external-format external-format)
(with-standard-io-syntax
(let ((*read-default-float-format* 'double-float))
(assoc-data-out assoc-result stream))))))