crime-report-parser/parser.js at master · umbc-sga/crime-report-parser

History

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

// import PDFExtract module and instantiate an object to access the API

const PDFExtract = require("pdf.js-extract").PDFExtract;

const pdfExtract = new PDFExtract();

// import filesystem module

const fs = require("fs");

const INPUT_DIR = "./input_data";

/**
 * Parse every crime report PDF in INPUT_DIR and write one JSON file per report into the
 * "output" directory. Runs immediately when this module is executed.
 *
 * Directory entries that do not end in ".pdf" are skipped, and only the file extension is
 * rewritten when deriving the output name (a plain replace("pdf", "json") would rewrite the
 * first "pdf" found anywhere in the name, e.g. "pdf_report.pdf" -> "json_report.pdf").
 */
(async function processData() {
    // array of all the input files (crime reports), ignoring anything that isn't a PDF
    const inputFiles = fs.readdirSync(INPUT_DIR)
        .filter(filename => /\.pdf$/i.test(filename));

    // make sure the output directory exists before writing into it
    fs.mkdirSync("output", { recursive: true });

    // go through all the input files and produce an output JSON file
    for (const filename of inputFiles)
    {
        // build the file path from the file name
        const filepath = `${INPUT_DIR}/${filename}`;

        // pull the data from the PDF
        const data = await getDataFromPDF(filepath);

        // swap only the trailing extension for ".json"
        const outputName = filename.replace(/\.pdf$/i, ".json");

        // write the data from the PDF to a JSON file
        fs.writeFileSync(`output/${outputName}`, JSON.stringify(data, null, 4));
    }
})().catch(err => {
    // surface parse/write failures instead of leaving an unhandled rejection
    console.error(err);
    process.exitCode = 1;
});

/**
 * Extract the incident entries from a crime report PDF.
 *
 * For each page, the text items are grouped into lines by y-coordinate, ordered left to
 * right, and joined into strings. The lines are then run through a small state machine:
 * the label a line starts with selects the field being filled, and a line without a known
 * label is handled as a continuation of the current field.
 *
 * Each entry may carry: reportDate, location, onCampus, timeStart, timeEnd, incident,
 * incidentClass, incidentSubClass (only when the incident contains " // "), cleryStatOnly,
 * disposition and dateModified. An entry is emitted when its "Modified Date" line is read.
 * @param {String} filename path of the PDF file to parse
 * @return {Promise<Object[]>} incident entries collected from all pages of the PDF
 */
async function getDataFromPDF(filename) {
    // extract PDF contents
    const data = await pdfExtract.extract(filename, {});

    // go through every page in the PDF and reconstruct lines
    const incidentEntries = [];

    for (const page of data.pages) {
        // get the lines of the PDF (lines is an array of line items arrays) by grouping by y-coordinate
        const sortedRawLines = Object.values(fuzzyGroupByYPos(page.content))
            // order lines by y-coordinate in ascending order
            .sort((a, b) => a[0].y - b[0].y)
            // order items within lines by x-coordinate in ascending order
            .map(x => x.sort((a, b) => a.x - b.x));

        // put together line strings
        const reconstructedLines = sortedRawLines
            .map(line => line.reduce((a, b) => a + b.str, ""))
            // chop off the header lines that aren't case data
            .slice(4);

        // NOTE(review): the parser state and the partially built entry start fresh on every
        // page, so an incident split across a page break would lose its first half — confirm
        // whether the reports ever do that.
        let state;
        let entry = {
            incident: ""
        };

        for (const line of reconstructedLines) {
            // get the state depending on what the line starts with
            if (line.startsWith("Date Reported")) state = "reportDate";
            else if (line.startsWith("General Location")) state = "location";
            else if (line.startsWith("Date Occurred From")) state = "timeStart";
            else if (line.startsWith("Date Occurred To")) state = "timeEnd";
            else if (line.startsWith("Incident/Offenses")) state = "incident";
            else if (line.startsWith("Disposition")) state = "disposition";
            else if (line.startsWith("Modified Date")) state = "dateModified";
            // stop at the summary line at the end of the report
            else if (line.includes("incident(s) listed")) break;

            // process a "Date Reported:" line
            if (state === "reportDate")
            {
                entry.reportDate = parseDateLine(line.replace("Date Reported:", ""));
            }
            // process a "General Location:" line
            else if (state === "location")
            {
                const locationLine = line.replace("General Location:", "");

                const tokens = locationLine.split(" - ");

                entry.location = tokens[0];
                entry.onCampus = tokens.includes("On Campus");
            }
            // process a "Date Occurred From:" line
            else if (state === "timeStart")
            {
                entry.timeStart = parseDateLine(line.replace("Date Occurred From:", ""));
            }
            // process a "Date Occurred To:" line
            else if (state === "timeEnd")
            {
                entry.timeEnd = parseDateLine(line.replace("Date Occurred To:", ""));
            }
            // process an "Incident/Offenses:" line
            else if (state === "incident")
            {
                // append to incident line in case it's multiple lines
                entry.incident += line.replace("Incident/Offenses:", "");
            }
            // process a "Disposition:" line
            else if (state === "disposition")
            {
                // once we are in this state, we know the incident line is complete so we can process it
                const incidentClass = entry.incident.substring(0, entry.incident.indexOf("-") - 1);
                entry.incidentClass = properCapitalize(incidentClass);

                // get rid of the incident class from the incident property
                entry.incident = entry.incident.replace(`${incidentClass} - `, "");

                // remove Clery Act stipulation from the incident and record as separate property
                if (entry.incident.includes("Clery Stat Only"))
                {
                    entry.incident = entry.incident.replace(`Clery Stat Only`, "");
                    entry.cleryStatOnly = true;
                }
                else
                {
                    entry.cleryStatOnly = false;
                }

                // split on "//" to get at the most specific sub class of the incident
                const subcategories = entry.incident.split(" // ");
                if (subcategories.length > 1)
                {
                    // the most specific subclass is the last category that follows a "//" separator
                    const mostSpecificSubClass = subcategories[subcategories.length - 1];

                    // clean off the subclass for an actual description
                    const incidentSubClass = mostSpecificSubClass.substring(mostSpecificSubClass.indexOf("-") + 1);

                    // trim non-alphabetic characters from the sub class string
                    entry.incidentSubClass = specialTrim(incidentSubClass);

                    // chop off the subclass from the incident property
                    entry.incident = entry.incident.substring(0, entry.incident.indexOf(" //"));
                }

                // TODO categorize "()" as sub classes of incident too?

                // replace PAT or INV disposition codes and convert to proper capitalization
                entry.disposition = properCapitalize(
                    line
                        .replace("Disposition:PAT-", "")
                        .replace("Disposition:inv-", "")
                        .replace("Disposition:INV-", "")
                );
            }
            // process a "Modified Date:" line
            else if (state === "dateModified")
            {
                entry.dateModified = parseDateLine(line.replace("Modified Date:", ""));

                // add a deep copy of the entry object to the incident entries array
                incidentEntries.push(JSON.parse(JSON.stringify(entry)));

                // reset the entry object to start parsing a new incident
                entry = {
                    incident: ""
                };
            }
        }
    }

    // return the array of incident entries once every page has been parsed
    return incidentEntries;
}

/**
 * Trim whitespace and any other non-alphabetic characters (anything outside a-z / A-Z)
 * from both ends of a string. Characters in the middle of the string are left untouched.
 *
 * An empty string, or a string with no alphabetic characters at all, yields "" rather
 * than throwing.
 * @param {String} str
 * @return {String} the trimmed string
 */
function specialTrim(str) {
    const alphabet = "abcdefghijklmnopqrstuvwxyz";
    const isAlphabetic = ch => alphabet.includes(ch.toLowerCase());

    let start = 0;
    let end = str.length;

    // skip non-alphabetic characters at the front, never running past the end
    while (start < end && !isAlphabetic(str[start]))
    {
        start++;
    }

    // skip non-alphabetic characters at the back, never crossing the front marker
    while (end > start && !isAlphabetic(str[end - 1]))
    {
        end--;
    }

    return str.slice(start, end);
}

/**
 * Capitalize a string word by word: within every space-separated word the first
 * character is upper-cased and the remainder is lower-cased.
 * @param {String} str
 * @return {String} properCapitalizedStr
 */
function properCapitalize(str) {
    const capitalizedWords = [];

    for (const word of str.split(" "))
    {
        const head = word.charAt(0).toUpperCase();
        const tail = word.slice(1).toLowerCase();

        capitalizedWords.push(head + tail);
    }

    return capitalizedWords.join(" ");
}

/**
 * Turn a report date line (label already stripped) into a timestamp in milliseconds.
 *
 * The date is the text before the first space; the time is the text that follows " at "
 * and runs to the end of the line, or up to "Report" when that word appears. The two
 * pieces are joined and handed to the Date constructor.
 * @param {String} line
 * @return {Number} date
 */
function parseDateLine(line) {
    // everything before the first space is the date
    const firstSpace = line.indexOf(" ");
    const date = line.substring(0, firstSpace);

    // drop the date (and the space after it) from the line
    const remainder = line.replace(`${date} `, "");

    // the time starts right after the " at " that follows the week day abbreviation
    const timeStart = remainder.indexOf(" at") + 4;

    // the time stops at the Report text when present (only applies to certain lines)
    const reportIndex = remainder.indexOf("Report");
    const timeEnd = reportIndex == -1 ? remainder.length : reportIndex;

    const time = remainder.substring(timeStart, timeEnd);

    return new Date(`${date} ${time}`).getTime();
}

/**
 * Group an array of items into lines by Y-positions with some tolerance for improper line
 * alignments.
 *
 * Each item joins the existing bucket whose key is nearest to its y value when that
 * distance is below the tolerance; otherwise it starts a new bucket keyed by its own y.
 * Only real buckets are considered when searching for the nearest one, so an item close
 * to y = 0 no longer matches a bucket that was never created.
 * @param {Object[]} items objects carrying a numeric `y` property
 * @param {Number} tolerance maximum y distance for an item to join an existing line
 * @return {Object} groupedLines map from y-position key to the array of items on that line
 */
function fuzzyGroupByYPos(items, tolerance=0.3) {
    return items.reduce((linesArray, item) => {
        // find the closest previously recorded y-pos bucket, if there is one
        let closest = null;
        let difference = Infinity;

        for (const key of Object.keys(linesArray))
        {
            // keys are stringified y-positions; the subtraction coerces them back to numbers
            const distance = Math.abs(key - item.y);

            if (distance < difference)
            {
                difference = distance;
                closest = key;
            }
        }

        // if the difference is close enough, it is the same line, just improperly aligned
        if (closest !== null && difference < tolerance)
        {
            linesArray[closest].push(item);
        }
        // otherwise it is a different line
        else
        {
            linesArray[item.y] = [item];
        }

        return linesArray;
    }, {});
}

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

parser.js

parser.js

Files

parser.js

Latest commit

History

parser.js

File metadata and controls