json/parse.go (651 lines of code) (raw):

// Licensed to Elasticsearch B.V. under one or more contributor // license agreements. See the NOTICE file distributed with // this work for additional information regarding copyright // ownership. Elasticsearch B.V. licenses this file to you under // the Apache License, Version 2.0 (the "License"); you may // not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. package json import ( "bytes" "errors" "fmt" "io" "math" "strconv" "unicode" "unicode/utf16" "unicode/utf8" structform "github.com/elastic/go-structform" ) type Parser struct { visitor structform.Visitor strVisitor structform.StringRefVisitor // last fail state err error // parser state machine states []state // state stack for nested arrays/objects currentState state // preallocate stack memory for up to 32 nested arrays/objects statesBuf [32]state literalBuffer []byte // literalBuffer0 is used as a fast reset option for literalBuffer without allocation. literalBuffer0 [64]byte inEscape bool isDouble bool required int } var ( errFailing = errors.New("JSON parser failed") errIncomplete = errors.New("Incomplete JSON input") errUnknownChar = errors.New("unknown character") errQuoteMissing = errors.New("missing closing quote") errExpectColon = errors.New("expected ':' after map key") errUnexpectedDictClose = errors.New("unexpected '}'") errUnexpectedArrClose = errors.New("unexpected ']'") errExpectedDigit = errors.New("expected a digit") errExpectedObject = errors.New("expected JSON object") errExpectedArray = errors.New("expected JSON array") errExpectedFieldName = errors.New("expected JSON object field name") errExpectedInteger = errors.New("expected integer value") errExpectedNull = errors.New("expected null value") errExpectedFalse = errors.New("expected false value") errExpectedTrue = errors.New("expected true value") errExpectedArrayField = errors.New("expected ']' or ','") errUnquoteInEscape = errors.New("incomplete escape at end of string") errUnquoteInvalidChar = errors.New("invalid character found in string") errUnquoteInvalidUnicode = errors.New("unicode escape is no hex number") errUnquoteUnknownEscape = errors.New("unknown escape sequence") ) type state uint8 //go:generate stringer -type=state const ( failedState state = iota startState arrState arrStateValue arrStateNext dictState dictFieldState dictNextFieldState dictFieldValue dictFieldValueSep dictFieldStateEnd nullState trueState falseState stringState numberState ) func ParseReader(in io.Reader, vs structform.Visitor) (int64, error) { p := NewParser(vs) i, err := io.Copy(p, in) if err == nil { err = p.finalize() } return i, err } func Parse(b []byte, vs structform.Visitor) error { return NewParser(vs).Parse(b) } func ParseString(str string, vs structform.Visitor) error { return NewParser(vs).ParseString(str) } func NewParser(vs structform.Visitor) *Parser { p := &Parser{} p.init(vs) return p } func (p *Parser) init(vs structform.Visitor) { *p = Parser{ visitor: vs, strVisitor: structform.MakeStringRefVisitor(vs), currentState: startState, } p.states = p.statesBuf[:0] p.literalBuffer = p.literalBuffer0[:0] } func (p *Parser) Parse(b []byte) error { p.states = p.states[:0] p.literalBuffer = p.literalBuffer[:0] p.currentState = startState p.err = p.feed(b) if p.err == nil { p.err = p.finalize() } return p.err } func (p *Parser) ParseString(str string) error { return p.Parse(str2Bytes(str)) } func (p *Parser) Write(b []byte) (int, error) { p.err = p.feed(b) if p.err != nil { return 0, p.err } return len(b), nil } func (p *Parser) feed(b []byte) error { for len(b) > 0 { n, _, err := p.feedUntil(b) if err != nil { return err } b = b[n:] } return nil } func (p *Parser) feedUntil(b []byte) (int, bool, error) { var ( err error reported bool orig = b ) for !reported && len(b) > 0 { switch p.currentState { case failedState: if p.err == nil { p.err = errors.New("invalid parser state") } return 0, false, p.err case startState: b, reported, err = p.stepStart(b) case dictState: b, reported, err = p.stepDict(b, true) case dictNextFieldState: b, reported, err = p.stepDict(b, false) case dictFieldState: b, err = p.stepDictKey(b) case dictFieldValueSep: if b = trimLeft(b); len(b) > 0 { if b[0] != ':' { err = errExpectColon } b = b[1:] p.currentState = dictFieldValue } case dictFieldValue: b, reported, err = p.stepValue(b, dictFieldStateEnd) case dictFieldStateEnd: b, reported, err = p.stepDictValueEnd(b) case arrState: b, reported, err = p.stepArray(b, true) case arrStateValue: b, _, err = p.stepValue(b, arrStateNext) case arrStateNext: b, reported, err = p.stepArrValueEnd(b) case nullState: b, reported, err = p.stepNULL(b) case trueState: b, reported, err = p.stepTRUE(b) case falseState: b, reported, err = p.stepFALSE(b) case stringState: b, reported, err = p.stepString(b) case numberState: b, reported, err = p.stepNumber(b) default: return 0, false, errFailing } if err != nil { break } reported = reported && len(p.states) == 0 } consumed := len(orig) - len(b) return consumed, reported, err } func (p *Parser) finalize() error { if p.currentState == numberState { err := p.reportNumber(p.literalBuffer, p.isDouble) if err != nil { return err } p.popState() } if len(p.states) > 0 && p.currentState != startState { return errIncomplete } return nil } func (p *Parser) pushState(next state) { if p.currentState != failedState { p.states = append(p.states, p.currentState) } p.currentState = next } func (p *Parser) popState() { if len(p.states) == 0 { p.currentState = failedState } else { last := len(p.states) - 1 p.currentState = p.states[last] p.states = p.states[:last] } } func (p *Parser) stepStart(b []byte) ([]byte, bool, error) { return p.stepValue(b, p.currentState) } func (p *Parser) stepValue(b []byte, retState state) ([]byte, bool, error) { b = trimLeft(b) if len(b) == 0 { return b, false, nil } p.currentState = retState c := b[0] switch c { case '{': // start dictionary p.pushState(dictState) return b[1:], false, p.visitor.OnObjectStart(-1, structform.AnyType) case '[': // start array p.pushState(arrState) return b[1:], false, p.visitor.OnArrayStart(-1, structform.AnyType) case 'n': // parse "null" p.pushState(nullState) p.required = 3 return p.stepNULL(b[1:]) case 'f': // parse "false" p.pushState(falseState) p.required = 4 return p.stepFALSE(b[1:]) case 't': // parse "true" p.pushState(trueState) p.required = 3 return p.stepTRUE(b[1:]) case '"': // parse string p.literalBuffer = p.literalBuffer[:0] p.pushState(stringState) p.inEscape = false return p.stepString(b[:]) default: // parse number? p.isDouble = false isNumber := c == '-' || c == '+' || c == '.' || isDigit(c) if !isNumber { return b, false, errUnknownChar } p.literalBuffer = p.literalBuffer0[:0] p.pushState(numberState) p.isDouble = false return p.stepNumber(b) } } func (p *Parser) stepDict(b []byte, allowEnd bool) ([]byte, bool, error) { b = trimLeft(b) if len(b) == 0 { return b, false, nil } c := b[0] switch c { case '}': if !allowEnd { return nil, false, errUnexpectedDictClose } return p.endDict(b) case '"': p.currentState = dictFieldState return b, false, nil default: return nil, false, errExpectedFieldName } } func (p *Parser) stepDictKey(b []byte) ([]byte, error) { ref, allocated, done, b, err := p.doString(b) if done && err == nil { p.currentState = dictFieldValueSep if !allocated { err = p.strVisitor.OnKeyRef(ref) } else { err = p.visitor.OnKey(bytes2Str(ref)) } } return b, err } func (p *Parser) stepDictValueEnd(b []byte) ([]byte, bool, error) { b = trimLeft(b) if len(b) == 0 { return b, false, nil } c := b[0] switch c { case '}': return p.endDict(b) case ',': p.currentState = dictNextFieldState return b[1:], false, nil default: return nil, false, errUnknownChar } } func (p *Parser) endDict(b []byte) ([]byte, bool, error) { p.popState() return b[1:], true, p.visitor.OnObjectFinished() } func (p *Parser) stepArray(b []byte, allowEnd bool) ([]byte, bool, error) { b = trimLeft(b) if len(b) == 0 { return b, false, nil } c := b[0] switch c { case ']': if !allowEnd { return nil, false, errUnexpectedArrClose } return p.endArray(b) } p.currentState = arrStateValue return b, false, nil } func (p *Parser) stepArrValueEnd(b []byte) ([]byte, bool, error) { b = trimLeft(b) if len(b) == 0 { return b, false, nil } c := b[0] switch c { case ']': return p.endArray(b) case ',': p.currentState = arrStateValue return b[1:], false, nil default: return nil, false, errUnknownChar } } func (p *Parser) endArray(b []byte) ([]byte, bool, error) { p.popState() return b[1:], true, p.visitor.OnArrayFinished() } func (p *Parser) stepString(b []byte) ([]byte, bool, error) { ref, allocated, done, b, err := p.doString(b) if done && err == nil { p.popState() if !allocated { err = p.strVisitor.OnStringRef(ref) } else { err = p.visitor.OnString(bytes2Str(ref)) } } return b, done, err } func (p *Parser) doString(b []byte) ([]byte, bool, bool, []byte, error) { stop := -1 done := false delta := 1 buf := b atStart := len(p.literalBuffer) == 0 if atStart { delta = 2 buf = b[1:] } inEscape := p.inEscape for i, c := range buf { if inEscape { inEscape = false continue } if c == '"' { done = true stop = i + delta break } else if c == '\\' { inEscape = true } } p.inEscape = inEscape if !done { p.literalBuffer = append(p.literalBuffer, b...) return nil, false, false, nil, nil } rest := b[stop:] b = b[:stop] if len(p.literalBuffer) > 0 { b = append(p.literalBuffer, b...) p.literalBuffer = b[:0] // reset buffer } var err error var allocated bool b = b[1 : len(b)-1] b, allocated, err = p.unquote(b) if err != nil { return nil, false, false, nil, err } return b, allocated, done, rest, nil } func (p *Parser) unquote(in []byte) ([]byte, bool, error) { if len(in) == 0 { return in, false, nil } // Check for unusual characters and escape sequence. If none is found, // return slice as is: i := 0 for i < len(in) { c := in[i] if c == '\\' || c == '"' || c < ' ' { break } if c < utf8.RuneSelf { i++ continue } r, sz := utf8.DecodeRune(in[i:]) if r == utf8.RuneError && sz == 1 { break } i += sz } // no special character found -> return as is if i == len(in) { return in, false, nil } // found escape character (or other unusual character) -> // allocate output buffer (try to use literalBuffer) out := p.literalBuffer[:0] allocated := false utf8Delta := 2 * utf8.UTFMax minLen := len(in) + utf8Delta if cap(out) < minLen { // TODO: is minLen < some upper bound, store in literalBuffer out = make([]byte, minLen) allocated = true } else { out = out[:minLen] } // init output buffer written := copy(out, in[:i]) for i < len(in) { if written > len(out)-utf8Delta { // out of room -> increase write buffer newLen := len(out) * 2 if cap(out) < newLen { tmp := make([]byte, len(out)*2) copy(tmp, out[:written]) out = tmp allocated = true } else { out = out[:newLen] } } c := in[i] switch { case c == '\\': i++ if i >= len(in) { return nil, false, errUnquoteInEscape } switch in[i] { default: return nil, false, errUnquoteUnknownEscape case '"', '\\', '/', '\'': out[written] = in[i] i++ written++ case 'b': out[written] = '\b' i++ written++ case 'f': out[written] = '\f' i++ written++ case 'n': out[written] = '\n' i++ written++ case 'r': out[written] = '\r' i++ written++ case 't': out[written] = '\t' i++ written++ case 'u': i++ code, err := strconv.ParseUint(string(in[i:i+4]), 16, 64) if err != nil { return nil, false, errUnquoteInvalidUnicode } i += 4 r := rune(code) if utf16.IsSurrogate(r) { var dec rune = unicode.ReplacementChar valid := in[i] == '\\' && in[i+1] == 'u' if valid { code, err := strconv.ParseUint(string(in[i+2:i+6]), 16, 64) if err == nil { dec = utf16.DecodeRune(r, rune(code)) if dec != unicode.ReplacementChar { i += 6 } } } r = dec } written += utf8.EncodeRune(out[written:], r) } case c == '"', c < ' ': return nil, false, errUnquoteInvalidChar case c < utf8.RuneSelf: out[written] = c i++ written++ default: _, sz := utf8.DecodeRune(in[i:]) i += sz written += copy(out[written:], in[i:i+sz]) } } return out[:written], allocated, nil } func (p *Parser) stepNumber(b []byte) ([]byte, bool, error) { // search for char in stop-set stop := -1 done := false for i, c := range b { isStopChar := c == ' ' || c == '\t' || c == '\f' || c == '\n' || c == '\r' || c == ',' || c == ']' || c == '}' if isStopChar { stop = i done = true break } p.isDouble = p.isDouble || c == '.' || c == 'e' || c == 'E' } if !done { p.literalBuffer = append(p.literalBuffer, b...) return nil, false, nil } rest := b[stop:] b = b[:stop] if len(p.literalBuffer) > 0 { b = append(p.literalBuffer, b...) p.literalBuffer = b[:0] // reset buffer } err := p.reportNumber(b, p.isDouble) p.popState() return rest, true, err } func (p *Parser) reportNumber(b []byte, isDouble bool) error { // parse number var err error if isDouble { var f float64 if f, err = strconv.ParseFloat(bytes2Str(b), 64); err == nil { err = p.visitor.OnFloat64(f) } } else { var i int64 if i, err = parseInt(b); err == nil { err = p.visitor.OnInt64(i) } } return err } func parseInt(b []byte) (int64, error) { neg := false if b[0] == '+' { b = b[1:] } else if b[0] == '-' { neg = true b = b[1:] } u, err := parseUint(b) n := int64(u) if neg { n = -n } return n, err } func parseUint(b []byte) (uint64, error) { const cutoff = math.MaxUint64/10 + 1 const maxVal = math.MaxUint64 var n uint64 for _, c := range b { d := int(c) - '0' if d < 0 || d > 9 { return 0, fmt.Errorf("'%s' is no valid number", b) } if n >= cutoff { return 0, fmt.Errorf("number overflow parsing '%v'", b) } n *= 10 n1 := n + uint64(d) if n1 < n || n1 > maxVal { return 0, fmt.Errorf("number overflow parsing '%v'", b) } n = n1 } return n, nil } func (p *Parser) stepNULL(b []byte) ([]byte, bool, error) { b, done, err := p.stepKind(b, []byte("null"), errExpectedNull) if done { err = p.visitor.OnNil() } return b, done, err } func (p *Parser) stepTRUE(b []byte) ([]byte, bool, error) { b, done, err := p.stepKind(b, []byte("true"), errExpectedTrue) if done { err = p.visitor.OnBool(true) } return b, done, err } func (p *Parser) stepFALSE(b []byte) ([]byte, bool, error) { b, done, err := p.stepKind(b, []byte("false"), errExpectedFalse) if done { err = p.visitor.OnBool(false) } return b, done, err } func (p *Parser) stepKind(b []byte, kind []byte, err error) ([]byte, bool, error) { n := p.required s := kind[len(kind)-n:] done := true if L := len(b); L < n { done = false p.required = n - L n = L s = s[:L] } if !bytes.HasPrefix(b, s) { return b, false, err } if done { p.popState() } return b[n:], done, nil } func isDigit(c byte) bool { return '0' <= c && c <= '9' } func trimLeft(b []byte) []byte { for i, c := range b { if !unicode.IsSpace(rune(c)) { return b[i:] } } return nil } var whitespace = " \t\r\n"