-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparser.js
265 lines (226 loc) · 9.97 KB
/
parser.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
// import PDFExtract module and instantiate an object to access the API
const PDFExtract = require("pdf.js-extract").PDFExtract;
const pdfExtract = new PDFExtract();
// import filesystem module
const fs = require("fs");
const INPUT_DIR = "./input_data";
/**
 * Parse every crime report PDF found in INPUT_DIR and write one pretty-printed JSON file per
 * report into the `output` directory. Runs immediately when this module is executed.
 *
 * A failure while reading, parsing or writing is logged and reported through a non-zero exit
 * code instead of being left as an unhandled promise rejection.
 */
(async function processData() {
    const OUTPUT_DIR = "output";
    // array of all the input files (crime reports)
    const inputFiles = fs.readdirSync(INPUT_DIR);
    // make sure the destination exists so the first write cannot fail on a fresh checkout
    fs.mkdirSync(OUTPUT_DIR, { recursive: true });
    // go through all the input files one at a time and produce an output JSON file for each
    for (const filename of inputFiles) {
        // build the path of the input file from its name
        const filepath = `${INPUT_DIR}/${filename}`;
        // pull the data from the PDF
        const data = await getDataFromPDF(filepath);
        // swap only the trailing extension; a plain replace("pdf", "json") would rewrite the
        // first "pdf" anywhere in the name (e.g. "pdf_log.pdf" -> "json_log.pdf")
        const outputName = filename.replace(/\.pdf$/i, ".json");
        // write the data from the PDF to a JSON file
        fs.writeFileSync(`${OUTPUT_DIR}/${outputName}`, JSON.stringify(data, null, 4));
    }
})().catch((error) => {
    console.error(error);
    process.exitCode = 1;
});
/**
 * Extract the contents of a crime report PDF and convert them into incident records.
 *
 * Each page's text items are grouped into lines, the first four lines of the page are dropped
 * as header, and the remaining lines are fed through a small state machine keyed on the label
 * each line starts with. A line without a label is handled under the state of the previous
 * labelled line. Parser state and the entry being built start fresh on every page.
 *
 * @param {String} filename path of the PDF file to read
 * @return {Promise<Object[]>} the incident entries in the order they were completed
 */
async function getDataFromPDF(filename) {
    // label prefixes paired with the parser state they select, tested in this order
    const labelStates = [
        ["Date Reported", "reportDate"],
        ["General Location", "location"],
        ["Date Occurred From", "timeStart"],
        ["Date Occurred To", "timeEnd"],
        ["Incident/Offenses", "incident"],
        ["Disposition", "disposition"],
        ["Modified Date", "dateModified"]
    ];

    // extract PDF contents
    const extracted = await pdfExtract.extract(filename, {});
    const incidentEntries = [];

    for (const page of extracted.pages) {
        // bucket the page items into rows by y-coordinate, top row first
        const rows = Object.values(fuzzyGroupByYPos(page.content));
        rows.sort((rowA, rowB) => rowA[0].y - rowB[0].y);

        // within each row order the items left to right and glue their text together
        const pageLines = [];
        for (const row of rows) {
            row.sort((left, right) => left.x - right.x);
            let text = "";
            for (const item of row) {
                text += item.str;
            }
            pageLines.push(text);
        }

        let state;
        let entry = { incident: "" };

        // the first four lines of a page are header lines, not case data
        for (const line of pageLines.slice(4)) {
            const matched = labelStates.find(([prefix]) => line.startsWith(prefix));
            if (matched) {
                state = matched[1];
            } else if (line.includes("incident(s) listed")) {
                // the summary line ends the useful content of this page
                break;
            }

            switch (state) {
                case "reportDate":
                    entry.reportDate = parseDateLine(line.replace("Date Reported:", ""));
                    break;

                case "location": {
                    const tokens = line.replace("General Location:", "").split(" - ");
                    entry.location = tokens[0];
                    entry.onCampus = tokens.includes("On Campus");
                    break;
                }

                case "timeStart":
                    entry.timeStart = parseDateLine(line.replace("Date Occurred From:", ""));
                    break;

                case "timeEnd":
                    entry.timeEnd = parseDateLine(line.replace("Date Occurred To:", ""));
                    break;

                case "incident":
                    // the incident text may wrap, so keep appending until the state changes
                    entry.incident += line.replace("Incident/Offenses:", "");
                    break;

                case "disposition": {
                    // reaching this state means the incident text is complete: the class is
                    // everything up to one character before the first "-"
                    const classEnd = entry.incident.indexOf("-") - 1;
                    const incidentClass = entry.incident.substring(0, classEnd);
                    entry.incidentClass = properCapitalize(incidentClass);
                    // drop the class prefix from the incident description
                    entry.incident = entry.incident.replace(`${incidentClass} - `, "");

                    // record the Clery Act marker as its own flag and strip it from the text
                    const isCleryStatOnly = entry.incident.includes("Clery Stat Only");
                    if (isCleryStatOnly) {
                        entry.incident = entry.incident.replace(`Clery Stat Only`, "");
                    }
                    entry.cleryStatOnly = isCleryStatOnly;

                    // the segment after the last " // " separator is the most specific sub class
                    const segments = entry.incident.split(" // ");
                    if (segments.length > 1) {
                        const lastSegment = segments[segments.length - 1];
                        // keep only what follows the first "-" of that segment, then trim
                        // non-alphabetic characters from both ends
                        const rawSubClass = lastSegment.substring(lastSegment.indexOf("-") + 1);
                        entry.incidentSubClass = specialTrim(rawSubClass);
                        // the incident keeps only the text before the first " //"
                        entry.incident = entry.incident.substring(0, entry.incident.indexOf(" //"));
                    }
                    // TODO categorize "()" as sub classes of incident too?

                    // strip the PAT / INV disposition code prefixes and normalise capitalization
                    const dispositionText = line
                        .replace("Disposition:PAT-", "")
                        .replace("Disposition:inv-", "")
                        .replace("Disposition:INV-", "");
                    entry.disposition = properCapitalize(dispositionText);
                    break;
                }

                case "dateModified":
                    entry.dateModified = parseDateLine(line.replace("Modified Date:", ""));
                    // store a JSON round-tripped copy of the finished entry, then start a new one
                    incidentEntries.push(JSON.parse(JSON.stringify(entry)));
                    entry = { incident: "" };
                    break;

                default:
                    // no label seen yet on this page: nothing to do with the line
                    break;
            }
        }
    }

    // every page has been processed
    return incidentEntries;
}
/**
 * Trim whitespace and any other non-alphabetic characters from both ends of a string.
 *
 * A character counts as alphabetic when its lower-cased form is one of a-z. Characters in the
 * middle of the string are left alone. An empty string, or one that contains no alphabetic
 * character at all, yields "" instead of throwing.
 *
 * @param {String} str
 * @return {String} the trimmed string
 */
function specialTrim(str) {
    const alphabet = "abcdefghijklmnopqrstuvwxyz";
    const isAlphabetic = (character) => alphabet.includes(character.toLowerCase());

    let start = 0;
    let end = str.length;
    // skip non-alphabetic characters at the front, stopping at the end of the string
    while (start < end && !isAlphabetic(str[start])) {
        start++;
    }
    // skip non-alphabetic characters at the back, never crossing the front cursor
    while (end > start && !isAlphabetic(str[end - 1])) {
        end--;
    }
    return str.slice(start, end);
}
/**
 * Upper-case the first character of every space-separated word and lower-case the rest of it.
 * @param {String} str
 * @return {String} properCapitalizedStr
 */
function properCapitalize(str) {
    const capitalizedWords = [];
    for (const word of str.split(" ")) {
        const head = word.charAt(0).toUpperCase();
        const tail = word.slice(1).toLowerCase();
        capitalizedWords.push(head + tail);
    }
    return capitalizedWords.join(" ");
}
/**
 * Convert the report's date text (label already removed) into a timestamp in milliseconds.
 *
 * The text up to the first space is taken as the date. The time is whatever follows " at "
 * in the rest of the line, cut off at the word "Report" when the line contains it. The two
 * parts are joined and handed to the Date constructor, so the result is NaN when they do not
 * form a parsable date.
 *
 * @param {String} line
 * @return {Number} date
 */
function parseDateLine(line) {
    // the date runs from the start of the line to just before the first space
    const firstSpace = line.indexOf(" ");
    const date = line.substring(0, firstSpace);

    // what is left once the date and the space after it are removed
    const remainder = line.replace(`${date} `, "");

    // the time begins four characters past the start of " at" (after the week day abbreviation)
    const timeStart = remainder.indexOf(" at") + 4;

    // the time ends where "Report" begins, or at the end of the line if it is absent
    const reportIndex = remainder.indexOf("Report");
    let timeEnd = remainder.length;
    if (reportIndex !== -1) {
        timeEnd = reportIndex;
    }

    const time = remainder.substring(timeStart, timeEnd);
    return new Date(`${date} ${time}`).getTime();
}
/**
 * Group an array of items into lines by Y-positions with some tolerance for improper line
 * alignments.
 *
 * Each bucket is keyed by the y value of the first item placed in it. An item joins the
 * nearest existing bucket when that bucket's key is less than `tolerance` away from the item's
 * y; otherwise it starts (or reuses) the bucket keyed by its own y. Only real buckets are
 * considered when looking for the nearest one, so items with a y close to 0 are grouped like
 * any others.
 *
 * @param {Object[]} items objects with a numeric `y` property
 * @param {Number} tolerance maximum (exclusive) distance for an item to join a bucket
 * @return {Object} groupedLines arrays of items keyed by y-position
 */
function fuzzyGroupByYPos(items, tolerance=0.3) {
    const groupedLines = {};
    for (const item of items) {
        // find the closest previously recorded y-pos bucket (the first one wins on a tie)
        let closestKey;
        let closestDistance = Infinity;
        for (const key of Object.keys(groupedLines)) {
            const distance = Math.abs(key - item.y);
            if (distance < closestDistance) {
                closestKey = key;
                closestDistance = distance;
            }
        }
        // if the closest bucket is near enough, it is the same line, just improperly aligned
        if (closestKey !== undefined && closestDistance < tolerance) {
            groupedLines[closestKey].push(item);
        }
        // otherwise it is a different line
        else {
            (groupedLines[item.y] = groupedLines[item.y] || []).push(item);
        }
    }
    return groupedLines;
}