Skip to content

Commit

Permalink
Json pointer (simdjson#220)
Browse files Browse the repository at this point in the history
* json pointer support

* Addition of tests for the json pointer

* Adding a new tool for the JSON Pointer support, and some documentation.
  • Loading branch information
ioioioio authored and lemire committed Jul 26, 2019
1 parent cb44b3b commit bcabdfc
Show file tree
Hide file tree
Showing 8 changed files with 381 additions and 7 deletions.
15 changes: 11 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 57,8 @@ endif # ifeq ($(DEBUG),1)
endif # ifeq ($(SANITIZE),1)
endif # ifeq ($(MEMSANITIZE),1)

# Lists of executable names, grouped by role (main tools, tests, comparison
# programs, supplementary parse variants).
# NOTE(review): MAINEXECUTABLES and TESTEXECUTABLES are each assigned twice
# here; with plain `=` the later assignment is the one make ends up using.
MAINEXECUTABLES=parse minify json2json jsonstats statisticalmodel
TESTEXECUTABLES=jsoncheck numberparsingcheck stringparsingcheck
MAINEXECUTABLES=parse minify json2json jsonstats statisticalmodel jsonpointer
TESTEXECUTABLES=jsoncheck numberparsingcheck stringparsingcheck pointercheck
COMPARISONEXECUTABLES=minifiercompetition parsingcompetition parseandstatcompetition distinctuseridcompetition allparserscheckfile allparsingcompetition
SUPPLEMENTARYEXECUTABLES=parse_noutf8validation parse_nonumberparsing parse_nostringparsing

Expand Down Expand Up @@ -91,20 91,22 @@ benchmark:
bash ./scripts/parser.sh
bash ./scripts/parseandstat.sh

# test: builds the listed check binaries and tools, runs each check and the
# two helper scripts in order, and prints a success message at the end.
test: jsoncheck numberparsingcheck stringparsingcheck basictests allparserscheckfile minify json2json
test: jsoncheck numberparsingcheck stringparsingcheck basictests allparserscheckfile minify json2json pointercheck
./basictests
./numberparsingcheck
./stringparsingcheck
./jsoncheck
./pointercheck
./scripts/testjson2json.sh
./scripts/issue150.sh
@echo "It looks like the code is good!"

# quiettest: builds the listed check binaries and tools, then runs each check
# and the two helper scripts in order; no closing message is printed.
quiettest: jsoncheck numberparsingcheck stringparsingcheck basictests allparserscheckfile minify json2json
quiettest: jsoncheck numberparsingcheck stringparsingcheck basictests allparserscheckfile minify json2json pointercheck
./basictests
./numberparsingcheck
./stringparsingcheck
./jsoncheck
./pointercheck
./scripts/testjson2json.sh
./scripts/issue150.sh

Expand Down Expand Up @@ -149,6 151,8 @@ numberparsingcheck:tests/numberparsingcheck.cpp $(HEADERS) $(LIBFILES)
# stringparsingcheck: test binary compiled directly from the listed library
# sources with the JSON_TEST_STRINGS macro defined.
stringparsingcheck:tests/stringparsingcheck.cpp $(HEADERS) $(LIBFILES)
$(CXX) $(CXXFLAGS) -o stringparsingcheck tests/stringparsingcheck.cpp src/jsonioutil.cpp src/jsonparser.cpp src/simdjson.cpp src/stage1_find_marks.cpp src/parsedjson.cpp -I. $(LIBFLAGS) -DJSON_TEST_STRINGS

# pointercheck: JSON Pointer test binary, compiled directly from the listed
# library sources (including src/parsedjsoniterator.cpp).
pointercheck:tests/pointercheck.cpp $(HEADERS) $(LIBFILES)
$(CXX) $(CXXFLAGS) -o pointercheck tests/pointercheck.cpp src/jsonioutil.cpp src/jsonparser.cpp src/simdjson.cpp src/stage1_find_marks.cpp src/parsedjson.cpp src/parsedjsoniterator.cpp -I. $(LIBFLAGS)

# minifiercompetition: built from benchmark/minifiercompetition.cpp; also
# depends on the `submodules` target and the minifier headers and libraries.
minifiercompetition: benchmark/minifiercompetition.cpp $(HEADERS) submodules $(MINIFIERHEADERS) $(LIBFILES) $(MINIFIERLIBFILES)
$(CXX) $(CXXFLAGS) -o minifiercompetition $(LIBFILES) $(MINIFIERLIBFILES) benchmark/minifiercompetition.cpp -I. $(LIBFLAGS) $(COREDEPSINCLUDE)
Expand All @@ -159,6 163,9 @@ minify: tools/minify.cpp $(HEADERS) $(MINIFIERHEADERS) $(LIBFILES) $(MINIFIERLIB
# json2json: command-line tool built from tools/json2json.cpp and $(LIBFILES).
# NOTE(review): the lone `$ ` before the source path expands to nothing in
# make; confirm whether an automatic variable was intended.
json2json: tools/json2json.cpp $(HEADERS) $(LIBFILES)
$(CXX) $(CXXFLAGS) -o json2json $ tools/json2json.cpp $(LIBFILES) -I.

# jsonpointer: command-line tool built from tools/jsonpointer.cpp and
# $(LIBFILES).
# NOTE(review): the lone `$ ` before the source path expands to nothing in
# make; confirm whether an automatic variable was intended.
jsonpointer: tools/jsonpointer.cpp $(HEADERS) $(LIBFILES)
$(CXX) $(CXXFLAGS) -o jsonpointer $ tools/jsonpointer.cpp $(LIBFILES) -I.

# jsonstats: command-line tool built from tools/jsonstats.cpp and $(LIBFILES).
# NOTE(review): the lone `$ ` before the source path expands to nothing in
# make; confirm whether an automatic variable was intended.
jsonstats: tools/jsonstats.cpp $(HEADERS) $(LIBFILES)
$(CXX) $(CXXFLAGS) -o jsonstats $ tools/jsonstats.cpp $(LIBFILES) -I.

Expand Down
17 changes: 16 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 67,7 @@ Under Windows, we build some tools using the windows/dirent_portable.h file (whi

## Code usage and example

The main API involves populating a `ParsedJson` object which hosts a fully navigable document-object-model (DOM) view of the JSON document. The main function is `json_parse` which takes a string containing the JSON document as well as a reference to pre-allocated `ParsedJson` object (which can be reused multiple time). Once you have populated the `ParsedJson` object you can navigate through the DOM with an iterator (e.g., created by `ParsedJson::iterator pjh(pj)`, see 'Navigating the parsed document').
The main API involves populating a `ParsedJson` object which hosts a fully navigable document-object-model (DOM) view of the JSON document. The DOM can be accessed, for example, using [JSON Pointer](https://tools.ietf.org/html/rfc6901) paths. The main function is `json_parse`, which takes a string containing the JSON document as well as a reference to a pre-allocated `ParsedJson` object (which can be reused multiple times). Once you have populated the `ParsedJson` object you can navigate through the DOM with an iterator (e.g., created by `ParsedJson::iterator pjh(pj)`, see 'Navigating the parsed document').

```C
#include "simdjson/jsonparser.h"
Expand Down Expand Up @@ -313,6 313,7 @@ If you find the version of `simdjson` shipped with `vcpkg` is out-of-date, feel
- `json2json mydoc.json` parses the document, constructs a model and then dumps back the result to standard output.
- `json2json -d mydoc.json` parses the document, constructs a model and then dumps model (as a tape) to standard output. The tape format is described in the accompanying file `tape.md`.
- `minify mydoc.json` minifies the JSON document, outputting the result to standard output. Minifying means to remove the unneeded white space characters.
- `jsonpointer mydoc.json <jsonpath> <jsonpath> ... <jsonpath>` parses the document, constructs a model and then processes a series of [JSON Pointer paths](https://tools.ietf.org/html/rfc6901). The result is itself a JSON document.
## Scope
Expand Down Expand Up @@ -347,6 348,20 @@ The parser works in two stages:
- Stage 1. (Find marks) Quickly identifies structural elements, strings, and so forth. We validate UTF-8 encoding at that stage.
- Stage 2. (Structure building) Involves constructing a "tree" of sorts (materialized as a tape) to navigate through the data. Strings and numbers are parsed at this stage.
## JSON Pointer
We can navigate the parsed JSON using JSON Pointers as per the [RFC6901 standard](https://tools.ietf.org/html/rfc6901).
You can build a tool (jsonpointer) to parse a JSON document and then issue an array of JSON Pointer queries:
```
make jsonpointer
./jsonpointer jsonexamples/small/demo.json /Image/Width /Image/Height /Image/IDs/2
./jsonpointer jsonexamples/twitter.json /statuses/0/id /statuses/1/id /statuses/2/id /statuses/3/id /statuses/4/id /statuses/5/id
```
In C++, given a `ParsedJson`, we can move to a node with the `move_to` method of its iterator (`ParsedJson::iterator`), passing a `std::string` representing the JSON Pointer query.
## Navigating the parsed document
Here is a code sample to dump back the parsed JSON to a string:
Expand Down
63 changes: 62 additions & 1 deletion include/simdjson/parsedjson.h
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 231,47 @@ struct ParsedJson {
// this is equivalent but much faster than calling "next()".
inline void move_to_value();

// When positioned at an array ('['), go one level deep and advance to the
// element at the given zero-based index.
// If successful, we are left pointing at that element;
// if not, we are still pointing at the array ('[').
inline bool move_to_index(uint32_t index);

// Moves the iterator to the value corresponding to the JSON pointer.
// The search always starts from the root of the document.
// If successful, we are left pointing at the value;
// if not, we are still pointing at the same value we were pointing at before the call.
// The JSON pointer follows the rfc6901 standard's syntax: https://tools.ietf.org/html/rfc6901
// A pointer starting with '#' is treated as the URI fragment representation.
// However, the standard says "If a referenced member name is not unique in an object,
// the member that is referenced is undefined, and evaluation fails".
// Here we just return the first corresponding value.
// The length parameter is the length of the JSON pointer string ('pointer').
bool move_to(const char * pointer, uint32_t length);

// Convenience overload: resolves a JSON pointer held in a std::string by
// forwarding its character buffer and length to move_to(const char *, uint32_t).
// The search always starts from the root of the document.
// On success the iterator points at the referenced value; on failure it still
// points at the value it pointed at before the call.
// The syntax follows rfc6901 (https://tools.ietf.org/html/rfc6901), with one
// deviation: the standard says "If a referenced member name is not unique in
// an object, the member that is referenced is undefined, and evaluation
// fails", whereas here the first matching member is returned.
inline bool move_to(const std::string & pointer) {
  const char * text = pointer.c_str();
  return move_to(text, pointer.length());
}



private:

// Almost the same as move_to(), except that it searches from the current position.
// The pointer's syntax is identical, though that case is not handled by the rfc6901 standard.
// The '/' is still required at the beginning.
// However, contrary to move_to(), the URI Fragment Identifier Representation is not supported here.
// Also, in case of failure, we are left pointing at the closest value it could reach.
// For these reasons it is private. It exists because it is used by move_to().
bool relative_move_to(const char * pointer, uint32_t length);
public:

// throughout return true if we can do the navigation, false
// otherwise

Expand Down Expand Up @@ -264,6 305,10 @@ struct ParsedJson {
// a scope is a series of nodes at the same level
inline void to_start_scope();

// Climbs back toward the root of the document by calling up() repeatedly
// until it reports that no further move is possible.
inline void rewind() {
  while (up()) {
    // nothing to do: each successful up() already moved us one scope higher
  }
}

// void to_end_scope(); // move us to
// the start of our current scope; always succeeds

Expand Down Expand Up @@ -419,8 464,24 @@ bool ParsedJson::iterator::move_to_key(const char * key, uint32_t length) {
return false;
}

// Descends into the current array and advances to the element at the given
// zero-based index.
// Precondition: the iterator points at an array (checked with assert).
// Returns true and leaves the iterator on the element when it exists.
// Returns false and leaves the iterator on the array itself otherwise.
bool ParsedJson::iterator::move_to_index(uint32_t index) {
  assert(is_array());
  if (!down()) {
    // down() did not enter the array, so we never left it: nothing to undo
    return false;
  }
  for (uint32_t i = 0; i < index; i++) {
    if (!next()) {
      // Ran out of elements before reaching `index`: climb back to the array.
      // up() must be called outside of assert(): with NDEBUG defined the
      // assert expression is not evaluated, which would leave the iterator
      // stranded inside the array.
      const bool went_up = up();
      assert(went_up);
      (void)went_up; // silences the unused-variable warning in NDEBUG builds
      return false;
    }
  }
  return true;
}

bool ParsedJson::iterator::prev() {
bool ParsedJson::iterator::prev() {
if(location - 1 < depthindex[depth].start_of_scope) {
return false;
}
Expand Down
2 changes: 1 addition & 1 deletion jsonexamples/twitter.json
Original file line number Diff line number Diff line change
Expand Up @@ -15479,4 15479,4 @@
"since_id": 0,
"since_id_str": "0"
}
}
}
168 changes: 168 additions & 0 deletions src/parsedjsoniterator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -93,4 93,172 @@ bool ParsedJson::iterator::print(std::ostream &os, bool escape_strings) const {
}
return true;
}

// Moves the iterator to the value designated by a JSON pointer, searching
// from the root of the document.
//
// pointer: the JSON pointer text; it does not need to be NUL-terminated,
//          only the first `length` bytes are read.
// length:  number of bytes in `pointer`.
//
// A pointer starting with '#' is treated as the URI fragment representation:
// the '#' is dropped and every "%xHH" sequence (HH = two hex digits) is
// decoded to the corresponding byte. Decoded backslashes, double quotes and
// control characters (<= 0x1F) are prefixed with a backslash so that
// relative_move_to() can un-escape them.
// NOTE(review): RFC 3986 percent-encoding is "%HH"; this implementation
// expects "%xHH". That form is kept as-is for compatibility - confirm intent.
//
// Returns true and leaves the iterator on the value when it is found.
// Returns false otherwise (including for a malformed or truncated "%x"
// sequence) and restores the position the iterator had before the call.
bool ParsedJson::iterator::move_to(const char * pointer, uint32_t length) {
  // Owns the decoded text when the fragment representation is used, so no
  // manual new[]/delete[] bookkeeping is needed on any return path.
  std::string decoded;
  if (length > 0 && pointer[0] == '#') {
    // value of one hexadecimal digit, or -1 when `c` is not a hex digit
    const auto hex_value = [](char c) -> int {
      if (c >= '0' && c <= '9') { return c - '0'; }
      if (c >= 'a' && c <= 'f') { return c - 'a' + 10; }
      if (c >= 'A' && c <= 'F') { return c - 'A' + 10; }
      return -1;
    };
    decoded.reserve(length);
    for (uint32_t i = 1; i < length; i++) {
      const bool starts_escape =
          pointer[i] == '%' && (i + 1 < length) && pointer[i + 1] == 'x';
      if (!starts_escape) {
        decoded += pointer[i];
        continue;
      }
      if (length - i < 4) {
        return false; // "%x" is not followed by two characters
      }
      const int high = hex_value(pointer[i + 2]);
      const int low = hex_value(pointer[i + 3]);
      if (high < 0 || low < 0) {
        return false; // the fragment is invalid
      }
      const int fragment = high * 16 + low;
      if (fragment == '\\' || fragment == '"' || (fragment <= 0x1F)) {
        // escaping the character
        decoded += '\\';
      }
      decoded += static_cast<char>(fragment);
      i += 3; // skip "xHH"; the loop increment skips the '%'
    }
    pointer = decoded.data();
    length = static_cast<uint32_t>(decoded.size());
  }

  // saving the current state
  // NOTE(review): only the depthindex pointer is saved, not the entries it
  // points to; if the navigation below rewrites those entries, the restored
  // iterator may see modified scope data - confirm against down()/up().
  size_t depth_s = depth;
  size_t location_s = location;
  uint8_t current_type_s = current_type;
  uint64_t current_val_s = current_val;
  scopeindex_t *depthindex_s = depthindex;

  rewind(); // The json pointer is used from the root of the document.

  const bool found = relative_move_to(pointer, length);

  if (!found) {
    // since the pointer has found nothing, we get back to the original position.
    depth = depth_s;
    location = location_s;
    current_type = current_type_s;
    current_val = current_val_s;
    depthindex = depthindex_s;
  }

  return found;
}

// Resolves a JSON pointer starting from the current position.
//
// pointer: the pointer text, which must be empty or start with '/'; only the
//          first `length` bytes are read.
// length:  number of bytes in `pointer`.
//
// The first reference token is extracted ("~1" becomes '/', "~0" becomes '~',
// a backslash followed by '\\', '"' or a control character yields that
// character), applied to the current object or array, and the rest of the
// pointer is resolved recursively.
// On an array, the token must be "-" (the nonexistent element just past the
// end) or a decimal index with no leading zero.
//
// Returns true when the whole pointer was resolved. On failure, returns false
// and leaves the iterator at the closest value it could reach.
bool ParsedJson::iterator::relative_move_to(const char * pointer, uint32_t length) {
  if (length == 0) {
    // returns the whole document
    return true;
  }

  if (pointer[0] != '/') {
    // '/' must be the first character
    return false;
  }

  // finding the key in an object or the index in an array
  std::string key_or_index;
  uint32_t offset = 1;

  // checking for the "-" case
  if (is_array() && length > 1 && pointer[1] == '-') {
    if (length != 2) {
      // the pointer must be exactly "/-"
      // there can't be anything more after '-' as an index
      return false;
    }
    key_or_index = '-';
    offset = length; // will skip the loop coming right after
  }

  // We either transform the first reference token to a valid json key
  // or we make sure it is a valid index in an array.
  for (; offset < length; offset++) {
    const char c = pointer[offset];
    if (c == '/') {
      // beginning of the next key or index
      break;
    }
    if (is_array() && (c < '0' || c > '9')) {
      // the index of an array must be made of decimal digits only
      return false;
    }
    // never look at pointer[offset + 1] unless it is inside the buffer
    const bool has_next = (offset + 1 < length);
    if (c == '~' && has_next) {
      const char code = pointer[offset + 1];
      if (code == '1') { // "~1" represents "/"
        key_or_index += '/';
        offset++;
        continue;
      }
      if (code == '0') { // "~0" represents "~"
        key_or_index += '~';
        offset++;
        continue;
      }
      // any other "~x" is kept literally: fall through and append '~'
    }
    if (c == '\\') {
      if (!has_next) {
        return false; // dangling backslash at the end of the pointer
      }
      const char escaped = pointer[offset + 1];
      // compare as unsigned so that bytes >= 0x80 are not mistaken for
      // control characters on platforms where char is signed
      if (escaped == '\\' || escaped == '"' ||
          static_cast<unsigned char>(escaped) <= 0x1F) {
        key_or_index += escaped;
        offset++;
        continue;
      }
      return false; // invalid escaped character
    }
    // An unescaped '"' is invalid, but it is deliberately not rejected:
    // it simply won't match any json key.
    key_or_index += c;
  }

  if (is_object()) {
    if (!move_to_key(key_or_index.c_str(), key_or_index.length())) {
      return false;
    }
    return relative_move_to(pointer + offset, length - offset);
  }

  if (!is_array()) {
    return false; // scalars have no children to descend into
  }

  if (key_or_index == "-") { // handling "-" case first
    if (!down()) {
      return false;
    }
    while (next()) {
      // moving to the end of the array
    }
    // moving to the nonexistent value right after...
    size_t npos;
    if ((current_type == '[') || (current_type == '{')) {
      // we need to jump
      npos = (current_val & JSONVALUEMASK);
    } else {
      npos = location + ((current_type == 'd' || current_type == 'l') ? 2 : 1);
    }
    location = npos;
    current_val = pj.tape[npos];
    current_type = (current_val >> 56);
    return true;
  }

  // regular numeric index
  if (key_or_index.empty()) {
    return false; // it cannot be empty
  }
  if (key_or_index[0] == '0' && key_or_index.length() > 1) {
    return false; // the index can't have a leading '0'
  }
  // The token is known to contain only decimal digits. Parse it by hand so
  // that an oversized index fails cleanly instead of throwing from std::stoi.
  uint64_t index = 0;
  for (const char digit : key_or_index) {
    index = index * 10 + static_cast<uint64_t>(digit - '0');
    if (index > 0xFFFFFFFFu) {
      return false; // does not fit the uint32_t taken by move_to_index()
    }
  }
  if (!move_to_index(static_cast<uint32_t>(index))) {
    return false;
  }
  return relative_move_to(pointer + offset, length - offset);
}
}
1 change: 1 addition & 0 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 6,7 @@ endif()

# Register each test through the add_cpp_test helper (defined elsewhere in
# the project), one call per test name.
add_cpp_test(basictests)
add_cpp_test(jsoncheck)
add_cpp_test(pointercheck)

## This causes problems
# add_executable(singleheader ./singleheadertest.cpp ${PROJECT_SOURCE_DIR}/singleheader/simdjson.cpp)
Expand Down
Loading

0 comments on commit bcabdfc

Please sign in to comment.