-
Notifications
You must be signed in to change notification settings - Fork 7
/
main.go
125 lines (105 loc) · 3.03 KB
/
main.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
package main
import (
	"bufio"
	"bytes"
	"flag"
	"fmt"
	"os"
	"path/filepath"
	"strings"
	"time"
)
var (
	// idToFile records the path of every indexed file in the order it was
	// added, so a position returned by a search can be turned back into the
	// file to open when looking for matching lines.
	idToFile []string

	// trigramMethod is the value of the -trigram command line flag; it is
	// empty when the flag was not supplied.
	trigramMethod string
)
// main indexes every readable, non-binary file under the current directory,
// prints the index counters and the time taken, and then, if a -trigram method
// was given, prompts for search terms on stdin until input ends. With no
// -trigram flag the program returns right after printing the statistics.
func main() {
	flag.StringVar(&trigramMethod, "trigram", "", "which trigram method should we use [default,merovius,dancantos,jamesrom,ffmiruz]")
	flag.Parse()

	startTime := time.Now()

	// walk the directory getting files and indexing
	walkErr := filepath.Walk(".", func(root string, info os.FileInfo, err error) error {
		if err != nil {
			return err
		}

		if info.IsDir() {
			return nil // we only care about files
		}

		res, err := os.ReadFile(root)
		if err != nil {
			return nil // best effort: an unreadable file is skipped, not fatal
		}

		// don't index binary files by looking for nul byte, similar to how grep does it
		if bytes.IndexByte(res, 0) != -1 {
			return nil
		}

		// only index up to about 5kb
		if len(res) > 5000 {
			res = res[:5000]
		}

		// add the document to the index; the result is ignored as before.
		// NOTE(review): if Add can reject a document, idToFile would drift out
		// of step with the ids held by the index - confirm against Add.
		_ = Add(Itemise(Tokenize(string(res))))

		// store the association from what's in the index to the filename, we know its 0 to whatever so this works
		idToFile = append(idToFile, root)
		return nil
	})
	if walkErr != nil {
		// The walk stops at the first error handed to the callback, so say so
		// rather than silently presenting a partial index.
		fmt.Fprintln(os.Stderr, "walk aborted, index may be incomplete:", walkErr)
	}

	// time.Since uses the monotonic clock, so a wall-clock adjustment during
	// indexing cannot skew the reported duration.
	indexTimeMilli := time.Since(startTime).Milliseconds()
	fmt.Printf("currentBlockDocumentCount:%v currentDocumentCount:%v currentBlockStartDocumentCount:%v indexTimeMilli:%v trigramMethod:%v\n", currentBlockDocumentCount, currentDocumentCount, currentBlockStartDocumentCount, indexTimeMilli, trigramMethod)

	if trigramMethod == "" {
		return
	}

	// Read whole lines so a multi-word query arrives in one piece, and stop
	// when stdin is exhausted instead of looping forever on EOF.
	// NOTE(review): Queryise now receives the full line including spaces -
	// confirm it tokenises multi-word input as findMatchingLines does.
	input := bufio.NewScanner(os.Stdin)
	for {
		fmt.Println("enter search term: ")
		if !input.Scan() {
			if err := input.Err(); err != nil {
				fmt.Fprintln(os.Stderr, "reading search term:", err)
			}
			return
		}

		searchTerm := strings.TrimSpace(input.Text())
		if searchTerm == "" {
			continue // nothing to search for; prompt again
		}

		res := Search(Queryise(searchTerm))

		fmt.Println("--------------")
		fmt.Println(len(res), "index result(s)")
		fmt.Println("")

		for _, r := range res {
			fmt.Println(idToFile[r])

			matching := findMatchingLines(idToFile[r], searchTerm, 5)
			for _, l := range matching {
				fmt.Println(l)
			}
			// The index returned this file but no line in it contains a term.
			if len(matching) == 0 {
				fmt.Println("false positive match")
			}
			fmt.Println("")
		}
	}
}
// findMatchingLines reads filename and returns up to limit of its lines that
// contain at least one term from query. Each returned entry is formatted as
// "<line number>. <line>" with 1-based line numbers.
//
// The query is lower-cased and split on whitespace, and terms shorter than
// three bytes are discarded before matching. Matching is a case-insensitive
// substring test; a line is reported once no matter how many terms it
// contains. Any single term is enough for a line to match and there is no
// ranking of better matches, so this is a deliberately simple linear scan.
//
// It returns nil when limit is not positive, when no usable term remains, or
// when the file cannot be read.
func findMatchingLines(filename string, query string, limit int) []string {
	if limit <= 0 {
		return nil
	}

	// Keep only the terms that are at least three bytes long.
	var cleanTerms []string
	for _, t := range strings.Fields(strings.ToLower(query)) {
		if len(t) >= 3 {
			cleanTerms = append(cleanTerms, t)
		}
	}
	if len(cleanTerms) == 0 {
		return nil
	}

	res, err := os.ReadFile(filename)
	if err != nil {
		return nil
	}

	var matches []string
	for i, l := range strings.Split(string(res), "\n") {
		low := strings.ToLower(l)
		for _, t := range cleanTerms {
			if strings.Contains(low, t) {
				matches = append(matches, fmt.Sprintf("%v. %v", i+1, l))
				break // one entry per line, however many terms it contains
			}
		}

		if len(matches) >= limit {
			return matches
		}
	}

	return matches
}