Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace strings.Trim with strings.TrimFunc in ParseRDF #5494

Merged
merged 3 commits into from
May 26, 2020

Conversation

ashish-goswami
Copy link
Contributor

@ashish-goswami ashish-goswami commented May 21, 2020

This PR replaces strings.Trim(item.Val, " ") with strings.TrimFunc(item.Val, isSpaceRune) in ParseRDF(). isSpaceRune looks like below:

func isSpaceRune(r rune) bool {
     return r == ' '
}

Benchmarks for different ways of Trimming a string:

func BenchmarkTrim(b *testing.B) {
	s := "      abcdefghi      "
	//fmt.Println("len(s): ", len(s))

	var trimmed string
	for i := 0; i < b.N; i++ {
		trimmed = strings.Trim(s, " ")
	}

	_ = trimmed
}

func BenchmarkTrimfunc(b *testing.B) {
	s := "      abcdefghi      "
	// fmt.Println("len(s): ", len(s))

	f := func(r rune) bool {
		return r == ' '
	}

	var trimmed string
	for i := 0; i < b.N; i++ {
		trimmed = strings.TrimFunc(s, f)
	}

	_ = trimmed
}

func BenchmarkTrimfuncUnicode(b *testing.B) {
	s := "      abcdefghi      "
	// fmt.Println("len(s): ", len(s))

	f := func(r rune) bool {
		return unicode.IsSpace(r)
	}

	var trimmed string
	for i := 0; i < b.N; i++ {
		trimmed = strings.TrimFunc(s, f)
	}

	_ = trimmed
}

func BenchmarkTrimSpace(b *testing.B) {
	s := "      abcdefghi      "
	// fmt.Println("len(s): ", len(s))

	var trimmed string
	for i := 0; i < b.N; i++ {
		trimmed = strings.TrimSpace(s)
	}

	_ = trimmed
}

Results:

goos: linux
goarch: amd64
pkg: github.com/dgraph-io/21million
BenchmarkTrim-8              	11225523	       104 ns/op	      32 B/op	       1 allocs/op
BenchmarkTrimfunc-8          	21282284	        59.1 ns/op	       0 B/op	       0 allocs/op
BenchmarkTrimfuncUnicode-8   	12513160	        96.9 ns/op	       0 B/op	       0 allocs/op
BenchmarkTrimSpace-8         	63749401	        19.0 ns/op	       0 B/op	       0 allocs/op
PASS
ok  	github.com/dgraph-io/21million	5.142s

We can see strings.TrimSpace() outperforms all in normal case. It will perform same as BenchmarkTrimfuncUnicode in case of non-ascii chars in string.
However strings.TrimSpace() removes all whitespaces like \t, \n etc, which might change our existing behaviour. Hence we are not using it for now.

Allocations shows up while live loading data on large dataset(~1 billion RDFs).
Before Profile:

   25.79GB    70.96GB (flat, cum) 21.54% of Total
         .          .     80:	if len(line) == 0 {
         .          .     81:		return rnq, ErrEmpty
         .          .     82:	}
         .          .     83:
         .          .     84:	l.Reset(line)
         .    19.39GB     85:	l.Run(lexText)
         .          .     86:	if err := l.ValidateResult(); err != nil {
         .          .     87:		return rnq, err
         .          .     88:	}
         .          .     89:	it := l.NewIterator()
         .          .     90:	var oval string
         .          .     91:	var seenOval bool
         .          .     92:	var vend bool
         .          .     93:	isCommentLine := false
         .          .     94:	// We read items from the l.Items channel to which the lexer sends items.
         .          .     95:L:
         .          .     96:	for it.Next() {
         .          .     97:		item := it.Item()
         .          .     98:		switch item.Typ {
         .          .     99:		case itemSubject:
         .    12.90GB    100:			rnq.Subject = strings.Trim(item.Val, " ")
         .          .    101:
         .          .    102:		case itemSubjectFunc:
         .          .    103:			var err error
         .          .    104:			if rnq.Subject, err = parseFunction(it); err != nil {
         .          .    105:				return rnq, err
         .          .    106:			}
         .          .    107:
         .          .    108:		case itemObjectFunc:
         .          .    109:			var err error
         .          .    110:			if rnq.ObjectId, err = parseFunction(it); err != nil {
         .          .    111:				return rnq, err
         .          .    112:			}
         .          .    113:
         .          .    114:		case itemPredicate:
         .          .    115:			// Here we split predicate and lang directive (ex: "name@en"), if needed.
         .    12.88GB    116:			rnq.Predicate, rnq.Lang = x.PredicateLang(strings.Trim(item.Val, " "))
         .          .    117:
         .          .    118:		case itemObject:
         .          .    119:			rnq.ObjectId = strings.Trim(item.Val, " ")
         .          .    120:
         .          .    121:		case itemStar:
         .          .    122:			switch {
         .          .    123:			case rnq.Subject == "":
         .          .    124:				rnq.Subject = x.Star
         .          .    125:			case rnq.Predicate == "":
         .          .    126:				rnq.Predicate = x.Star
         .          .    127:			default:
         .          .    128:				rnq.ObjectValue = &api.Value{Val: &api.Value_DefaultVal{DefaultVal: x.Star}}
         .          .    129:			}
         .          .    130:
         .          .    131:		case itemLiteral:
         .          .    132:			var err error
         .          .    133:			oval, err = strconv.Unquote(item.Val)
         .          .    134:			if err != nil {
         .          .    135:				return rnq, errors.Wrapf(err, "while unquoting")
         .          .    136:			}
         .          .    137:			seenOval = true
         .          .    138:
         .          .    139:		case itemLanguage:
         .          .    140:			rnq.Lang = item.Val
         .          .    141:
         .          .    142:		case itemObjectType:
         .          .    143:			if rnq.Predicate == x.Star || rnq.Subject == x.Star {
         .          .    144:				return rnq, errors.Errorf("If predicate/subject is *, value should be * as well")
         .          .    145:			}
         .          .    146:
         .          .    147:			val := strings.Trim(item.Val, " ")
         .          .    148:			// TODO: Check if this condition is required.
         .          .    149:			if strings.Trim(val, " ") == "*" {
         .          .    150:				return rnq, errors.Errorf("itemObject can't be *")
         .          .    151:			}
         .          .    152:			// Lets find out the storage type from the type map.
         .          .    153:			t, ok := typeMap[val]
         .          .    154:			if !ok {
         .          .    155:				return rnq, errors.Errorf("Unrecognized rdf type %s", val)
         .          .    156:			}
         .          .    157:			if oval == "" && t != types.StringID {
         .          .    158:				return rnq, errors.Errorf("Invalid ObjectValue")
         .          .    159:			}
         .          .    160:			src := types.ValueForType(types.StringID)
         .          .    161:			src.Value = []byte(oval)
         .          .    162:			// if this is a password value dont re-encrypt. issue#2765
         .          .    163:			if t == types.PasswordID {
         .          .    164:				src.Tid = t
         .          .    165:			}
         .          .    166:			p, err := types.Convert(src, t)
         .          .    167:			if err != nil {
         .          .    168:				return rnq, err
         .          .    169:			}
         .          .    170:
         .          .    171:			if rnq.ObjectValue, err = types.ObjectValue(t, p.Value); err != nil {
         .          .    172:				return rnq, err
         .          .    173:			}
         .          .    174:		case itemComment:
         .          .    175:			isCommentLine = true
         .          .    176:			vend = true
         .          .    177:
         .          .    178:		case itemValidEnd:
         .          .    179:			vend = true
         .          .    180:			if !it.Next() {
         .          .    181:				return rnq, errors.Errorf("Invalid end of input. Input: [%s]", line)
         .          .    182:			}
         .          .    183:			// RDF spec says N-Quads should be terminated with a newline. Since we break the input
         .          .    184:			// by newline already. We should get EOF or # after dot(.)
         .          .    185:			item = it.Item()
         .          .    186:			if !(item.Typ == lex.ItemEOF || item.Typ == itemComment) {
         .          .    187:				return rnq, errors.Errorf("Invalid end of input. Expected newline or # after ."+
         .          .    188:					" Input: [%s]", line)
         .          .    189:			}
         .          .    190:			break L
         .          .    191:
         .          .    192:		case itemLabel:
         .          .    193:			rnq.Label = strings.Trim(item.Val, " ")
         .          .    194:
         .          .    195:		case itemLeftRound:
         .          .    196:			it.Prev() // backup '('
         .          .    197:			if err := parseFacetsRDF(it, &rnq); err != nil {
         .          .    198:				return rnq, errors.Wrap(err, "could not parse facet")
         .          .    199:			}
         .          .    200:		}
         .          .    201:	}
         .          .    202:
         .          .    203:	if !vend {
         .          .    204:		return rnq, errors.Errorf("Invalid end of input. Input: [%s]", line)
         .          .    205:	}
         .          .    206:	if isCommentLine {
         .          .    207:		return rnq, ErrEmpty
         .          .    208:	}
         .          .    209:	// We only want to set default value if we have seen ObjectValue within "" and if we didn't
         .          .    210:	// already set it.
         .          .    211:	if seenOval && rnq.ObjectValue == nil {
   25.79GB    25.79GB    212:		rnq.ObjectValue = &api.Value{Val: &api.Value_DefaultVal{DefaultVal: oval}}
         .          .    213:	}
         .          .    214:	if len(rnq.Subject) == 0 || len(rnq.Predicate) == 0 {
         .          .    215:		return rnq, errors.Errorf("Empty required fields in NQuad. Input: [%s]", line)
         .          .    216:	}
         .          .    217:	if len(rnq.ObjectId) == 0 && rnq.ObjectValue == nil {

This PR:

 25.79GB    45.20GB (flat, cum) 14.00% of Total
         .          .     84:	if len(line) == 0 {
         .          .     85:		return rnq, ErrEmpty
         .          .     86:	}
         .          .     87:
         .          .     88:	l.Reset(line)
         .    19.42GB     89:	l.Run(lexText)
         .          .     90:	if err := l.ValidateResult(); err != nil {
         .          .     91:		return rnq, err
         .          .     92:	}
         .          .     93:	it := l.NewIterator()
         .          .     94:	var oval string
         .          .     95:	var seenOval bool
         .          .     96:	var vend bool
         .          .     97:	isCommentLine := false
         .          .     98:	// We read items from the l.Items channel to which the lexer sends items.
         .          .     99:L:
         .          .    100:	for it.Next() {
         .          .    101:		item := it.Item()
         .          .    102:		switch item.Typ {
         .          .    103:		case itemSubject:
         .          .    104:			rnq.Subject = strings.TrimFunc(item.Val, isSpaceRune)
         .          .    105:
         .          .    106:		case itemSubjectFunc:
         .          .    107:			var err error
         .          .    108:			if rnq.Subject, err = parseFunction(it); err != nil {
         .          .    109:				return rnq, err
         .          .    110:			}
         .          .    111:
         .          .    112:		case itemObjectFunc:
         .          .    113:			var err error
         .          .    114:			if rnq.ObjectId, err = parseFunction(it); err != nil {
         .          .    115:				return rnq, err
         .          .    116:			}
         .          .    117:
         .          .    118:		case itemPredicate:
         .          .    119:			// Here we split predicate and lang directive (ex: "name@en"), if needed.
         .          .    120:			rnq.Predicate, rnq.Lang = x.PredicateLang(strings.TrimFunc(item.Val, isSpaceRune))
         .          .    121:
         .          .    122:		case itemObject:
         .          .    123:			rnq.ObjectId = strings.TrimFunc(item.Val, isSpaceRune)
         .          .    124:
         .          .    125:		case itemStar:
         .          .    126:			switch {
         .          .    127:			case rnq.Subject == "":
         .          .    128:				rnq.Subject = x.Star
         .          .    129:			case rnq.Predicate == "":
         .          .    130:				rnq.Predicate = x.Star
         .          .    131:			default:
         .          .    132:				rnq.ObjectValue = &api.Value{Val: &api.Value_DefaultVal{DefaultVal: x.Star}}
         .          .    133:			}
         .          .    134:
         .          .    135:		case itemLiteral:
         .          .    136:			var err error
         .          .    137:			oval, err = strconv.Unquote(item.Val)
         .          .    138:			if err != nil {
         .          .    139:				return rnq, errors.Wrapf(err, "while unquoting")
         .          .    140:			}
         .          .    141:			seenOval = true
         .          .    142:
         .          .    143:		case itemLanguage:
         .          .    144:			rnq.Lang = item.Val
         .          .    145:
         .          .    146:		case itemObjectType:
         .          .    147:			if rnq.Predicate == x.Star || rnq.Subject == x.Star {
         .          .    148:				return rnq, errors.Errorf("If predicate/subject is *, value should be * as well")
         .          .    149:			}
         .          .    150:
         .          .    151:			val := strings.TrimFunc(item.Val, isSpaceRune)
         .          .    152:			// TODO: Check if this condition is required.
         .          .    153:			if val == "*" {
         .          .    154:				return rnq, errors.Errorf("itemObject can't be *")
         .          .    155:			}
         .          .    156:			// Lets find out the storage type from the type map.
         .          .    157:			t, ok := typeMap[val]
         .          .    158:			if !ok {
         .          .    159:				return rnq, errors.Errorf("Unrecognized rdf type %s", val)
         .          .    160:			}
         .          .    161:			if oval == "" && t != types.StringID {
         .          .    162:				return rnq, errors.Errorf("Invalid ObjectValue")
         .          .    163:			}
         .          .    164:			src := types.ValueForType(types.StringID)
         .          .    165:			src.Value = []byte(oval)
         .          .    166:			// if this is a password value dont re-encrypt. issue#2765
         .          .    167:			if t == types.PasswordID {
         .          .    168:				src.Tid = t
         .          .    169:			}
         .          .    170:			p, err := types.Convert(src, t)
         .          .    171:			if err != nil {
         .          .    172:				return rnq, err
         .          .    173:			}
         .          .    174:
         .          .    175:			if rnq.ObjectValue, err = types.ObjectValue(t, p.Value); err != nil {
         .          .    176:				return rnq, err
         .          .    177:			}
         .          .    178:		case itemComment:
         .          .    179:			isCommentLine = true
         .          .    180:			vend = true
         .          .    181:
         .          .    182:		case itemValidEnd:
         .          .    183:			vend = true
         .          .    184:			if !it.Next() {
         .          .    185:				return rnq, errors.Errorf("Invalid end of input. Input: [%s]", line)
         .          .    186:			}
         .          .    187:			// RDF spec says N-Quads should be terminated with a newline. Since we break the input
         .          .    188:			// by newline already. We should get EOF or # after dot(.)
         .          .    189:			item = it.Item()
         .          .    190:			if !(item.Typ == lex.ItemEOF || item.Typ == itemComment) {
         .          .    191:				return rnq, errors.Errorf("Invalid end of input. Expected newline or # after ."+
         .          .    192:					" Input: [%s]", line)
         .          .    193:			}
         .          .    194:			break L
         .          .    195:
         .          .    196:		case itemLabel:
         .          .    197:			rnq.Label = strings.TrimSpace(item.Val)
         .          .    198:
         .          .    199:		case itemLeftRound:
         .          .    200:			it.Prev() // backup '('
         .          .    201:			if err := parseFacetsRDF(it, &rnq); err != nil {
         .          .    202:				return rnq, errors.Wrap(err, "could not parse facet")
         .          .    203:			}
         .          .    204:		}
         .          .    205:	}
         .          .    206:
         .          .    207:	if !vend {
         .          .    208:		return rnq, errors.Errorf("Invalid end of input. Input: [%s]", line)
         .          .    209:	}
         .          .    210:	if isCommentLine {
         .          .    211:		return rnq, ErrEmpty
         .          .    212:	}
         .          .    213:	// We only want to set default value if we have seen ObjectValue within "" and if we didn't
         .          .    214:	// already set it.
         .          .    215:	if seenOval && rnq.ObjectValue == nil {
   25.79GB    25.79GB    216:		rnq.ObjectValue = &api.Value{Val: &api.Value_DefaultVal{DefaultVal: oval}}
         .          .    217:	}
         .          .    218:	if len(rnq.Subject) == 0 || len(rnq.Predicate) == 0 {
         .          .    219:		return rnq, errors.Errorf("Empty required fields in NQuad. Input: [%s]", line)
         .          .    220:	}
         .          .    221:	if len(rnq.ObjectId) == 0 && rnq.ObjectValue == nil {

This change is Reviewable

Docs Preview: Dgraph Preview

@ashish-goswami ashish-goswami changed the title Repalce strings.Trim with strings.TrimSpace in ParseRDF Replace strings.Trim with strings.TrimSpace in ParseRDF May 21, 2020
Copy link
Contributor

@manishrjain manishrjain left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

:lgtm:

Reviewed 1 of 1 files at r1.
Reviewable status: :shipit: complete! all files reviewed, all discussions resolved (waiting on @vvbalaji-dgraph)

TrimSpace is faster but it removes all whitespaces
like tab, newlines etc. Using TrimSpace might change
existing behaviour. This commit replaces TrimSpace with
TrimFunc. TrimFunc is passed isSpaceRune function to avoid
alloctions.
@ashish-goswami ashish-goswami changed the title Replace strings.Trim with strings.TrimSpace in ParseRDF Replace strings.Trim with strings.TrimFunc in ParseRDF May 26, 2020
@ashish-goswami ashish-goswami merged commit db3d851 into master May 26, 2020
@ashish-goswami ashish-goswami deleted the ashish/parser branch May 26, 2020 11:25
ashish-goswami added a commit that referenced this pull request May 26, 2020
This PR replaces strings.Trim(item.Val, " ") with strings.TrimFunc(item.Val, isSpaceRune) in ParseRDF(). isSpaceRune looks like below:

func isSpaceRune(r rune) bool {
     return r == ' '
}
Benchmarks for different ways of Trimming a string:

func BenchmarkTrim(b *testing.B) {
	s := "      abcdefghi      "
	//fmt.Println("len(s): ", len(s))

	var trimmed string
	for i := 0; i < b.N; i++ {
		trimmed = strings.Trim(s, " ")
	}

	_ = trimmed
}

func BenchmarkTrimfunc(b *testing.B) {
	s := "      abcdefghi      "
	// fmt.Println("len(s): ", len(s))

	f := func(r rune) bool {
		return r == ' '
	}

	var trimmed string
	for i := 0; i < b.N; i++ {
		trimmed = strings.TrimFunc(s, f)
	}

	_ = trimmed
}

func BenchmarkTrimfuncUnicode(b *testing.B) {
	s := "      abcdefghi      "
	// fmt.Println("len(s): ", len(s))

	f := func(r rune) bool {
		return unicode.IsSpace(r)
	}

	var trimmed string
	for i := 0; i < b.N; i++ {
		trimmed = strings.TrimFunc(s, f)
	}

	_ = trimmed
}

func BenchmarkTrimSpace(b *testing.B) {
	s := "      abcdefghi      "
	// fmt.Println("len(s): ", len(s))

	var trimmed string
	for i := 0; i < b.N; i++ {
		trimmed = strings.TrimSpace(s)
	}

	_ = trimmed
}
Results:

goos: linux
goarch: amd64
pkg: github.com/dgraph-io/21million
BenchmarkTrim-8              	11225523	       104 ns/op	      32 B/op	       1 allocs/op
BenchmarkTrimfunc-8          	21282284	        59.1 ns/op	       0 B/op	       0 allocs/op
BenchmarkTrimfuncUnicode-8   	12513160	        96.9 ns/op	       0 B/op	       0 allocs/op
BenchmarkTrimSpace-8         	63749401	        19.0 ns/op	       0 B/op	       0 allocs/op
PASS
ok  	github.com/dgraph-io/21million	5.142s
We can see strings.TrimSpace() outperforms all in normal case. It will perform same as BenchmarkTrimfuncUnicode in case of non-ascii chars in string.
However strings.TrimSpace() removes all whitespaces like \t, \n etc, which might change our existing behaviour. Hence we are not using it for now.

Allocations shows up while live loading data on large dataset(~1 billion RDFs).
Before Profile:

   25.79GB    70.96GB (flat, cum) 21.54% of Total
         .          .     80:	if len(line) == 0 {
         .          .     81:		return rnq, ErrEmpty
         .          .     82:	}
         .          .     83:
         .          .     84:	l.Reset(line)
         .    19.39GB     85:	l.Run(lexText)
         .          .     86:	if err := l.ValidateResult(); err != nil {
         .          .     87:		return rnq, err
         .          .     88:	}
         .          .     89:	it := l.NewIterator()
         .          .     90:	var oval string
         .          .     91:	var seenOval bool
         .          .     92:	var vend bool
         .          .     93:	isCommentLine := false
         .          .     94:	// We read items from the l.Items channel to which the lexer sends items.
         .          .     95:L:
         .          .     96:	for it.Next() {
         .          .     97:		item := it.Item()
         .          .     98:		switch item.Typ {
         .          .     99:		case itemSubject:
         .    12.90GB    100:			rnq.Subject = strings.Trim(item.Val, " ")
         .          .    101:
         .          .    102:		case itemSubjectFunc:
         .          .    103:			var err error
         .          .    104:			if rnq.Subject, err = parseFunction(it); err != nil {
         .          .    105:				return rnq, err
         .          .    106:			}
         .          .    107:
         .          .    108:		case itemObjectFunc:
         .          .    109:			var err error
         .          .    110:			if rnq.ObjectId, err = parseFunction(it); err != nil {
         .          .    111:				return rnq, err
         .          .    112:			}
         .          .    113:
         .          .    114:		case itemPredicate:
         .          .    115:			// Here we split predicate and lang directive (ex: "name@en"), if needed.
         .    12.88GB    116:			rnq.Predicate, rnq.Lang = x.PredicateLang(strings.Trim(item.Val, " "))
         .          .    117:
         .          .    118:		case itemObject:
         .          .    119:			rnq.ObjectId = strings.Trim(item.Val, " ")
         .          .    120:
         .          .    121:		case itemStar:
         .          .    122:			switch {
         .          .    123:			case rnq.Subject == "":
         .          .    124:				rnq.Subject = x.Star
         .          .    125:			case rnq.Predicate == "":
         .          .    126:				rnq.Predicate = x.Star
         .          .    127:			default:
         .          .    128:				rnq.ObjectValue = &api.Value{Val: &api.Value_DefaultVal{DefaultVal: x.Star}}
         .          .    129:			}
         .          .    130:
         .          .    131:		case itemLiteral:
         .          .    132:			var err error
         .          .    133:			oval, err = strconv.Unquote(item.Val)
         .          .    134:			if err != nil {
         .          .    135:				return rnq, errors.Wrapf(err, "while unquoting")
         .          .    136:			}
         .          .    137:			seenOval = true
         .          .    138:
         .          .    139:		case itemLanguage:
         .          .    140:			rnq.Lang = item.Val
         .          .    141:
         .          .    142:		case itemObjectType:
         .          .    143:			if rnq.Predicate == x.Star || rnq.Subject == x.Star {
         .          .    144:				return rnq, errors.Errorf("If predicate/subject is *, value should be * as well")
         .          .    145:			}
         .          .    146:
         .          .    147:			val := strings.Trim(item.Val, " ")
         .          .    148:			// TODO: Check if this condition is required.
         .          .    149:			if strings.Trim(val, " ") == "*" {
         .          .    150:				return rnq, errors.Errorf("itemObject can't be *")
         .          .    151:			}
         .          .    152:			// Lets find out the storage type from the type map.
         .          .    153:			t, ok := typeMap[val]
         .          .    154:			if !ok {
         .          .    155:				return rnq, errors.Errorf("Unrecognized rdf type %s", val)
         .          .    156:			}
         .          .    157:			if oval == "" && t != types.StringID {
         .          .    158:				return rnq, errors.Errorf("Invalid ObjectValue")
         .          .    159:			}
         .          .    160:			src := types.ValueForType(types.StringID)
         .          .    161:			src.Value = []byte(oval)
         .          .    162:			// if this is a password value dont re-encrypt. issue#2765
         .          .    163:			if t == types.PasswordID {
         .          .    164:				src.Tid = t
         .          .    165:			}
         .          .    166:			p, err := types.Convert(src, t)
         .          .    167:			if err != nil {
         .          .    168:				return rnq, err
         .          .    169:			}
         .          .    170:
         .          .    171:			if rnq.ObjectValue, err = types.ObjectValue(t, p.Value); err != nil {
         .          .    172:				return rnq, err
         .          .    173:			}
         .          .    174:		case itemComment:
         .          .    175:			isCommentLine = true
         .          .    176:			vend = true
         .          .    177:
         .          .    178:		case itemValidEnd:
         .          .    179:			vend = true
         .          .    180:			if !it.Next() {
         .          .    181:				return rnq, errors.Errorf("Invalid end of input. Input: [%s]", line)
         .          .    182:			}
         .          .    183:			// RDF spec says N-Quads should be terminated with a newline. Since we break the input
         .          .    184:			// by newline already. We should get EOF or # after dot(.)
         .          .    185:			item = it.Item()
         .          .    186:			if !(item.Typ == lex.ItemEOF || item.Typ == itemComment) {
         .          .    187:				return rnq, errors.Errorf("Invalid end of input. Expected newline or # after ."+
         .          .    188:					" Input: [%s]", line)
         .          .    189:			}
         .          .    190:			break L
         .          .    191:
         .          .    192:		case itemLabel:
         .          .    193:			rnq.Label = strings.Trim(item.Val, " ")
         .          .    194:
         .          .    195:		case itemLeftRound:
         .          .    196:			it.Prev() // backup '('
         .          .    197:			if err := parseFacetsRDF(it, &rnq); err != nil {
         .          .    198:				return rnq, errors.Wrap(err, "could not parse facet")
         .          .    199:			}
         .          .    200:		}
         .          .    201:	}
         .          .    202:
         .          .    203:	if !vend {
         .          .    204:		return rnq, errors.Errorf("Invalid end of input. Input: [%s]", line)
         .          .    205:	}
         .          .    206:	if isCommentLine {
         .          .    207:		return rnq, ErrEmpty
         .          .    208:	}
         .          .    209:	// We only want to set default value if we have seen ObjectValue within "" and if we didn't
         .          .    210:	// already set it.
         .          .    211:	if seenOval && rnq.ObjectValue == nil {
   25.79GB    25.79GB    212:		rnq.ObjectValue = &api.Value{Val: &api.Value_DefaultVal{DefaultVal: oval}}
         .          .    213:	}
         .          .    214:	if len(rnq.Subject) == 0 || len(rnq.Predicate) == 0 {
         .          .    215:		return rnq, errors.Errorf("Empty required fields in NQuad. Input: [%s]", line)
         .          .    216:	}
         .          .    217:	if len(rnq.ObjectId) == 0 && rnq.ObjectValue == nil {
This PR:

      26GB    45.50GB (flat, cum) 14.15% of Total
         .          .     80:	if len(line) == 0 {
         .          .     81:		return rnq, ErrEmpty
         .          .     82:	}
         .          .     83:
         .          .     84:	l.Reset(line)
         .    19.49GB     85:	l.Run(lexText)
         .          .     86:	if err := l.ValidateResult(); err != nil {
         .          .     87:		return rnq, err
         .          .     88:	}
         .          .     89:	it := l.NewIterator()
         .          .     90:	var oval string
         .          .     91:	var seenOval bool
         .          .     92:	var vend bool
         .          .     93:	isCommentLine := false
         .          .     94:	// We read items from the l.Items channel to which the lexer sends items.
         .          .     95:L:
         .          .     96:	for it.Next() {
         .          .     97:		item := it.Item()
         .          .     98:		switch item.Typ {
         .          .     99:		case itemSubject:
         .          .    100:			rnq.Subject = strings.TrimSpace(item.Val)
         .          .    101:
         .          .    102:		case itemSubjectFunc:
         .          .    103:			var err error
         .          .    104:			if rnq.Subject, err = parseFunction(it); err != nil {
         .          .    105:				return rnq, err
         .          .    106:			}
         .          .    107:
         .          .    108:		case itemObjectFunc:
         .          .    109:			var err error
         .          .    110:			if rnq.ObjectId, err = parseFunction(it); err != nil {
         .          .    111:				return rnq, err
         .          .    112:			}
         .          .    113:
         .          .    114:		case itemPredicate:
         .          .    115:			// Here we split predicate and lang directive (ex: "name@en"), if needed.
         .          .    116:			rnq.Predicate, rnq.Lang = x.PredicateLang(strings.TrimSpace(item.Val))
         .          .    117:
         .          .    118:		case itemObject:
         .          .    119:			rnq.ObjectId = strings.TrimSpace(item.Val)
         .          .    120:
         .          .    121:		case itemStar:
         .          .    122:			switch {
         .          .    123:			case rnq.Subject == "":
         .          .    124:				rnq.Subject = x.Star
         .          .    125:			case rnq.Predicate == "":
         .          .    126:				rnq.Predicate = x.Star
         .          .    127:			default:
         .          .    128:				rnq.ObjectValue = &api.Value{Val: &api.Value_DefaultVal{DefaultVal: x.Star}}
         .          .    129:			}
         .          .    130:
         .          .    131:		case itemLiteral:
         .          .    132:			var err error
         .          .    133:			oval, err = strconv.Unquote(item.Val)
         .          .    134:			if err != nil {
         .          .    135:				return rnq, errors.Wrapf(err, "while unquoting")
         .          .    136:			}
         .          .    137:			seenOval = true
         .          .    138:
         .          .    139:		case itemLanguage:
         .          .    140:			rnq.Lang = item.Val
         .          .    141:
         .          .    142:		case itemObjectType:
         .          .    143:			if rnq.Predicate == x.Star || rnq.Subject == x.Star {
         .          .    144:				return rnq, errors.Errorf("If predicate/subject is *, value should be * as well")
         .          .    145:			}
         .          .    146:
         .          .    147:			val := strings.TrimSpace(item.Val)
         .          .    148:			// TODO: Check if this condition is required.
         .          .    149:			if val == "*" {
         .          .    150:				return rnq, errors.Errorf("itemObject can't be *")
         .          .    151:			}
         .          .    152:			// Lets find out the storage type from the type map.
         .          .    153:			t, ok := typeMap[val]
         .          .    154:			if !ok {
         .          .    155:				return rnq, errors.Errorf("Unrecognized rdf type %s", val)
         .          .    156:			}
         .          .    157:			if oval == "" && t != types.StringID {
         .          .    158:				return rnq, errors.Errorf("Invalid ObjectValue")
         .          .    159:			}
         .          .    160:			src := types.ValueForType(types.StringID)
         .          .    161:			src.Value = []byte(oval)
         .          .    162:			// if this is a password value dont re-encrypt. issue#2765
         .          .    163:			if t == types.PasswordID {
         .          .    164:				src.Tid = t
         .          .    165:			}
         .          .    166:			p, err := types.Convert(src, t)
         .          .    167:			if err != nil {
         .          .    168:				return rnq, err
         .          .    169:			}
         .          .    170:
         .          .    171:			if rnq.ObjectValue, err = types.ObjectValue(t, p.Value); err != nil {
         .          .    172:				return rnq, err
         .          .    173:			}
         .          .    174:		case itemComment:
         .          .    175:			isCommentLine = true
         .          .    176:			vend = true
         .          .    177:
         .          .    178:		case itemValidEnd:
         .          .    179:			vend = true
         .          .    180:			if !it.Next() {
         .          .    181:				return rnq, errors.Errorf("Invalid end of input. Input: [%s]", line)
         .          .    182:			}
         .          .    183:			// RDF spec says N-Quads should be terminated with a newline. Since we break the input
         .          .    184:			// by newline already. We should get EOF or # after dot(.)
         .          .    185:			item = it.Item()
         .          .    186:			if !(item.Typ == lex.ItemEOF || item.Typ == itemComment) {
         .          .    187:				return rnq, errors.Errorf("Invalid end of input. Expected newline or # after ."+
         .          .    188:					" Input: [%s]", line)
         .          .    189:			}
         .          .    190:			break L
         .          .    191:
         .          .    192:		case itemLabel:
         .          .    193:			rnq.Label = strings.TrimSpace(item.Val)
         .          .    194:
         .          .    195:		case itemLeftRound:
         .          .    196:			it.Prev() // backup '('
         .          .    197:			if err := parseFacetsRDF(it, &rnq); err != nil {
         .          .    198:				return rnq, errors.Wrap(err, "could not parse facet")
         .          .    199:			}
         .          .    200:		}
         .          .    201:	}
         .          .    202:
         .          .    203:	if !vend {
         .          .    204:		return rnq, errors.Errorf("Invalid end of input. Input: [%s]", line)
         .          .    205:	}
         .          .    206:	if isCommentLine {
         .          .    207:		return rnq, ErrEmpty
         .          .    208:	}
         .          .    209:	// We only want to set default value if we have seen ObjectValue within "" and if we didn't
         .          .    210:	// already set it.
         .          .    211:	if seenOval && rnq.ObjectValue == nil {
      26GB       26GB    212:		rnq.ObjectValue = &api.Value{Val: &api.Value_DefaultVal{DefaultVal: oval}}
         .          .    213:	}
         .          .    214:	if len(rnq.Subject) == 0 || len(rnq.Predicate) == 0 {
         .          .    215:		return rnq, errors.Errorf("Empty required fields in NQuad. Input: [%s]", line)
         .          .    216:	}
         .          .    217:	if len(rnq.ObjectId) == 0 && rnq.ObjectValue == nil {

(cherry picked from commit db3d851)
ashish-goswami added a commit that referenced this pull request May 26, 2020
This PR replaces strings.Trim(item.Val, " ") with strings.TrimFunc(item.Val, isSpaceRune) in ParseRDF(). isSpaceRune looks like below:

func isSpaceRune(r rune) bool {
     return r == ' '
}
Benchmarks for different ways of Trimming a string:

func BenchmarkTrim(b *testing.B) {
	s := "      abcdefghi      "
	//fmt.Println("len(s): ", len(s))

	var trimmed string
	for i := 0; i < b.N; i++ {
		trimmed = strings.Trim(s, " ")
	}

	_ = trimmed
}

func BenchmarkTrimfunc(b *testing.B) {
	s := "      abcdefghi      "
	// fmt.Println("len(s): ", len(s))

	f := func(r rune) bool {
		return r == ' '
	}

	var trimmed string
	for i := 0; i < b.N; i++ {
		trimmed = strings.TrimFunc(s, f)
	}

	_ = trimmed
}

func BenchmarkTrimfuncUnicode(b *testing.B) {
	s := "      abcdefghi      "
	// fmt.Println("len(s): ", len(s))

	f := func(r rune) bool {
		return unicode.IsSpace(r)
	}

	var trimmed string
	for i := 0; i < b.N; i++ {
		trimmed = strings.TrimFunc(s, f)
	}

	_ = trimmed
}

func BenchmarkTrimSpace(b *testing.B) {
	s := "      abcdefghi      "
	// fmt.Println("len(s): ", len(s))

	var trimmed string
	for i := 0; i < b.N; i++ {
		trimmed = strings.TrimSpace(s)
	}

	_ = trimmed
}
Results:

goos: linux
goarch: amd64
pkg: github.com/dgraph-io/21million
BenchmarkTrim-8              	11225523	       104 ns/op	      32 B/op	       1 allocs/op
BenchmarkTrimfunc-8          	21282284	        59.1 ns/op	       0 B/op	       0 allocs/op
BenchmarkTrimfuncUnicode-8   	12513160	        96.9 ns/op	       0 B/op	       0 allocs/op
BenchmarkTrimSpace-8         	63749401	        19.0 ns/op	       0 B/op	       0 allocs/op
PASS
ok  	github.com/dgraph-io/21million	5.142s
We can see strings.TrimSpace() outperforms all in normal case. It will perform same as BenchmarkTrimfuncUnicode in case of non-ascii chars in string.
However strings.TrimSpace() removes all whitespaces like \t, \n etc, which might change our existing behaviour. Hence we are not using it for now.

Allocations shows up while live loading data on large dataset(~1 billion RDFs).
Before Profile:

   25.79GB    70.96GB (flat, cum) 21.54% of Total
         .          .     80:	if len(line) == 0 {
         .          .     81:		return rnq, ErrEmpty
         .          .     82:	}
         .          .     83:
         .          .     84:	l.Reset(line)
         .    19.39GB     85:	l.Run(lexText)
         .          .     86:	if err := l.ValidateResult(); err != nil {
         .          .     87:		return rnq, err
         .          .     88:	}
         .          .     89:	it := l.NewIterator()
         .          .     90:	var oval string
         .          .     91:	var seenOval bool
         .          .     92:	var vend bool
         .          .     93:	isCommentLine := false
         .          .     94:	// We read items from the l.Items channel to which the lexer sends items.
         .          .     95:L:
         .          .     96:	for it.Next() {
         .          .     97:		item := it.Item()
         .          .     98:		switch item.Typ {
         .          .     99:		case itemSubject:
         .    12.90GB    100:			rnq.Subject = strings.Trim(item.Val, " ")
         .          .    101:
         .          .    102:		case itemSubjectFunc:
         .          .    103:			var err error
         .          .    104:			if rnq.Subject, err = parseFunction(it); err != nil {
         .          .    105:				return rnq, err
         .          .    106:			}
         .          .    107:
         .          .    108:		case itemObjectFunc:
         .          .    109:			var err error
         .          .    110:			if rnq.ObjectId, err = parseFunction(it); err != nil {
         .          .    111:				return rnq, err
         .          .    112:			}
         .          .    113:
         .          .    114:		case itemPredicate:
         .          .    115:			// Here we split predicate and lang directive (ex: "name@en"), if needed.
         .    12.88GB    116:			rnq.Predicate, rnq.Lang = x.PredicateLang(strings.Trim(item.Val, " "))
         .          .    117:
         .          .    118:		case itemObject:
         .          .    119:			rnq.ObjectId = strings.Trim(item.Val, " ")
         .          .    120:
         .          .    121:		case itemStar:
         .          .    122:			switch {
         .          .    123:			case rnq.Subject == "":
         .          .    124:				rnq.Subject = x.Star
         .          .    125:			case rnq.Predicate == "":
         .          .    126:				rnq.Predicate = x.Star
         .          .    127:			default:
         .          .    128:				rnq.ObjectValue = &api.Value{Val: &api.Value_DefaultVal{DefaultVal: x.Star}}
         .          .    129:			}
         .          .    130:
         .          .    131:		case itemLiteral:
         .          .    132:			var err error
         .          .    133:			oval, err = strconv.Unquote(item.Val)
         .          .    134:			if err != nil {
         .          .    135:				return rnq, errors.Wrapf(err, "while unquoting")
         .          .    136:			}
         .          .    137:			seenOval = true
         .          .    138:
         .          .    139:		case itemLanguage:
         .          .    140:			rnq.Lang = item.Val
         .          .    141:
         .          .    142:		case itemObjectType:
         .          .    143:			if rnq.Predicate == x.Star || rnq.Subject == x.Star {
         .          .    144:				return rnq, errors.Errorf("If predicate/subject is *, value should be * as well")
         .          .    145:			}
         .          .    146:
         .          .    147:			val := strings.Trim(item.Val, " ")
         .          .    148:			// TODO: Check if this condition is required.
         .          .    149:			if strings.Trim(val, " ") == "*" {
         .          .    150:				return rnq, errors.Errorf("itemObject can't be *")
         .          .    151:			}
         .          .    152:			// Lets find out the storage type from the type map.
         .          .    153:			t, ok := typeMap[val]
         .          .    154:			if !ok {
         .          .    155:				return rnq, errors.Errorf("Unrecognized rdf type %s", val)
         .          .    156:			}
         .          .    157:			if oval == "" && t != types.StringID {
         .          .    158:				return rnq, errors.Errorf("Invalid ObjectValue")
         .          .    159:			}
         .          .    160:			src := types.ValueForType(types.StringID)
         .          .    161:			src.Value = []byte(oval)
         .          .    162:			// if this is a password value dont re-encrypt. issue#2765
         .          .    163:			if t == types.PasswordID {
         .          .    164:				src.Tid = t
         .          .    165:			}
         .          .    166:			p, err := types.Convert(src, t)
         .          .    167:			if err != nil {
         .          .    168:				return rnq, err
         .          .    169:			}
         .          .    170:
         .          .    171:			if rnq.ObjectValue, err = types.ObjectValue(t, p.Value); err != nil {
         .          .    172:				return rnq, err
         .          .    173:			}
         .          .    174:		case itemComment:
         .          .    175:			isCommentLine = true
         .          .    176:			vend = true
         .          .    177:
         .          .    178:		case itemValidEnd:
         .          .    179:			vend = true
         .          .    180:			if !it.Next() {
         .          .    181:				return rnq, errors.Errorf("Invalid end of input. Input: [%s]", line)
         .          .    182:			}
         .          .    183:			// RDF spec says N-Quads should be terminated with a newline. Since we break the input
         .          .    184:			// by newline already. We should get EOF or # after dot(.)
         .          .    185:			item = it.Item()
         .          .    186:			if !(item.Typ == lex.ItemEOF || item.Typ == itemComment) {
         .          .    187:				return rnq, errors.Errorf("Invalid end of input. Expected newline or # after ."+
         .          .    188:					" Input: [%s]", line)
         .          .    189:			}
         .          .    190:			break L
         .          .    191:
         .          .    192:		case itemLabel:
         .          .    193:			rnq.Label = strings.Trim(item.Val, " ")
         .          .    194:
         .          .    195:		case itemLeftRound:
         .          .    196:			it.Prev() // backup '('
         .          .    197:			if err := parseFacetsRDF(it, &rnq); err != nil {
         .          .    198:				return rnq, errors.Wrap(err, "could not parse facet")
         .          .    199:			}
         .          .    200:		}
         .          .    201:	}
         .          .    202:
         .          .    203:	if !vend {
         .          .    204:		return rnq, errors.Errorf("Invalid end of input. Input: [%s]", line)
         .          .    205:	}
         .          .    206:	if isCommentLine {
         .          .    207:		return rnq, ErrEmpty
         .          .    208:	}
         .          .    209:	// We only want to set default value if we have seen ObjectValue within "" and if we didn't
         .          .    210:	// already set it.
         .          .    211:	if seenOval && rnq.ObjectValue == nil {
   25.79GB    25.79GB    212:		rnq.ObjectValue = &api.Value{Val: &api.Value_DefaultVal{DefaultVal: oval}}
         .          .    213:	}
         .          .    214:	if len(rnq.Subject) == 0 || len(rnq.Predicate) == 0 {
         .          .    215:		return rnq, errors.Errorf("Empty required fields in NQuad. Input: [%s]", line)
         .          .    216:	}
         .          .    217:	if len(rnq.ObjectId) == 0 && rnq.ObjectValue == nil {
This PR:

      26GB    45.50GB (flat, cum) 14.15% of Total
         .          .     80:	if len(line) == 0 {
         .          .     81:		return rnq, ErrEmpty
         .          .     82:	}
         .          .     83:
         .          .     84:	l.Reset(line)
         .    19.49GB     85:	l.Run(lexText)
         .          .     86:	if err := l.ValidateResult(); err != nil {
         .          .     87:		return rnq, err
         .          .     88:	}
         .          .     89:	it := l.NewIterator()
         .          .     90:	var oval string
         .          .     91:	var seenOval bool
         .          .     92:	var vend bool
         .          .     93:	isCommentLine := false
         .          .     94:	// We read items from the l.Items channel to which the lexer sends items.
         .          .     95:L:
         .          .     96:	for it.Next() {
         .          .     97:		item := it.Item()
         .          .     98:		switch item.Typ {
         .          .     99:		case itemSubject:
         .          .    100:			rnq.Subject = strings.TrimSpace(item.Val)
         .          .    101:
         .          .    102:		case itemSubjectFunc:
         .          .    103:			var err error
         .          .    104:			if rnq.Subject, err = parseFunction(it); err != nil {
         .          .    105:				return rnq, err
         .          .    106:			}
         .          .    107:
         .          .    108:		case itemObjectFunc:
         .          .    109:			var err error
         .          .    110:			if rnq.ObjectId, err = parseFunction(it); err != nil {
         .          .    111:				return rnq, err
         .          .    112:			}
         .          .    113:
         .          .    114:		case itemPredicate:
         .          .    115:			// Here we split predicate and lang directive (ex: "name@en"), if needed.
         .          .    116:			rnq.Predicate, rnq.Lang = x.PredicateLang(strings.TrimSpace(item.Val))
         .          .    117:
         .          .    118:		case itemObject:
         .          .    119:			rnq.ObjectId = strings.TrimSpace(item.Val)
         .          .    120:
         .          .    121:		case itemStar:
         .          .    122:			switch {
         .          .    123:			case rnq.Subject == "":
         .          .    124:				rnq.Subject = x.Star
         .          .    125:			case rnq.Predicate == "":
         .          .    126:				rnq.Predicate = x.Star
         .          .    127:			default:
         .          .    128:				rnq.ObjectValue = &api.Value{Val: &api.Value_DefaultVal{DefaultVal: x.Star}}
         .          .    129:			}
         .          .    130:
         .          .    131:		case itemLiteral:
         .          .    132:			var err error
         .          .    133:			oval, err = strconv.Unquote(item.Val)
         .          .    134:			if err != nil {
         .          .    135:				return rnq, errors.Wrapf(err, "while unquoting")
         .          .    136:			}
         .          .    137:			seenOval = true
         .          .    138:
         .          .    139:		case itemLanguage:
         .          .    140:			rnq.Lang = item.Val
         .          .    141:
         .          .    142:		case itemObjectType:
         .          .    143:			if rnq.Predicate == x.Star || rnq.Subject == x.Star {
         .          .    144:				return rnq, errors.Errorf("If predicate/subject is *, value should be * as well")
         .          .    145:			}
         .          .    146:
         .          .    147:			val := strings.TrimSpace(item.Val)
         .          .    148:			// TODO: Check if this condition is required.
         .          .    149:			if val == "*" {
         .          .    150:				return rnq, errors.Errorf("itemObject can't be *")
         .          .    151:			}
         .          .    152:			// Lets find out the storage type from the type map.
         .          .    153:			t, ok := typeMap[val]
         .          .    154:			if !ok {
         .          .    155:				return rnq, errors.Errorf("Unrecognized rdf type %s", val)
         .          .    156:			}
         .          .    157:			if oval == "" && t != types.StringID {
         .          .    158:				return rnq, errors.Errorf("Invalid ObjectValue")
         .          .    159:			}
         .          .    160:			src := types.ValueForType(types.StringID)
         .          .    161:			src.Value = []byte(oval)
         .          .    162:			// if this is a password value dont re-encrypt. issue#2765
         .          .    163:			if t == types.PasswordID {
         .          .    164:				src.Tid = t
         .          .    165:			}
         .          .    166:			p, err := types.Convert(src, t)
         .          .    167:			if err != nil {
         .          .    168:				return rnq, err
         .          .    169:			}
         .          .    170:
         .          .    171:			if rnq.ObjectValue, err = types.ObjectValue(t, p.Value); err != nil {
         .          .    172:				return rnq, err
         .          .    173:			}
         .          .    174:		case itemComment:
         .          .    175:			isCommentLine = true
         .          .    176:			vend = true
         .          .    177:
         .          .    178:		case itemValidEnd:
         .          .    179:			vend = true
         .          .    180:			if !it.Next() {
         .          .    181:				return rnq, errors.Errorf("Invalid end of input. Input: [%s]", line)
         .          .    182:			}
         .          .    183:			// RDF spec says N-Quads should be terminated with a newline. Since we break the input
         .          .    184:			// by newline already. We should get EOF or # after dot(.)
         .          .    185:			item = it.Item()
         .          .    186:			if !(item.Typ == lex.ItemEOF || item.Typ == itemComment) {
         .          .    187:				return rnq, errors.Errorf("Invalid end of input. Expected newline or # after ."+
         .          .    188:					" Input: [%s]", line)
         .          .    189:			}
         .          .    190:			break L
         .          .    191:
         .          .    192:		case itemLabel:
         .          .    193:			rnq.Label = strings.TrimSpace(item.Val)
         .          .    194:
         .          .    195:		case itemLeftRound:
         .          .    196:			it.Prev() // backup '('
         .          .    197:			if err := parseFacetsRDF(it, &rnq); err != nil {
         .          .    198:				return rnq, errors.Wrap(err, "could not parse facet")
         .          .    199:			}
         .          .    200:		}
         .          .    201:	}
         .          .    202:
         .          .    203:	if !vend {
         .          .    204:		return rnq, errors.Errorf("Invalid end of input. Input: [%s]", line)
         .          .    205:	}
         .          .    206:	if isCommentLine {
         .          .    207:		return rnq, ErrEmpty
         .          .    208:	}
         .          .    209:	// We only want to set default value if we have seen ObjectValue within "" and if we didn't
         .          .    210:	// already set it.
         .          .    211:	if seenOval && rnq.ObjectValue == nil {
      26GB       26GB    212:		rnq.ObjectValue = &api.Value{Val: &api.Value_DefaultVal{DefaultVal: oval}}
         .          .    213:	}
         .          .    214:	if len(rnq.Subject) == 0 || len(rnq.Predicate) == 0 {
         .          .    215:		return rnq, errors.Errorf("Empty required fields in NQuad. Input: [%s]", line)
         .          .    216:	}
         .          .    217:	if len(rnq.ObjectId) == 0 && rnq.ObjectValue == nil {

(cherry picked from commit db3d851)
dna2github pushed a commit to dna2fork/dgraph that referenced this pull request Jul 18, 2020
This PR replaces strings.Trim(item.Val, " ") with strings.TrimFunc(item.Val, isSpaceRune) in ParseRDF(). isSpaceRune looks like below:

func isSpaceRune(r rune) bool {
     return r == ' '
}
Benchmarks for different ways of Trimming a string:

func BenchmarkTrim(b *testing.B) {
	s := "      abcdefghi      "
	//fmt.Println("len(s): ", len(s))

	var trimmed string
	for i := 0; i < b.N; i++ {
		trimmed = strings.Trim(s, " ")
	}

	_ = trimmed
}

func BenchmarkTrimfunc(b *testing.B) {
	s := "      abcdefghi      "
	// fmt.Println("len(s): ", len(s))

	f := func(r rune) bool {
		return r == ' '
	}

	var trimmed string
	for i := 0; i < b.N; i++ {
		trimmed = strings.TrimFunc(s, f)
	}

	_ = trimmed
}

func BenchmarkTrimfuncUnicode(b *testing.B) {
	s := "      abcdefghi      "
	// fmt.Println("len(s): ", len(s))

	f := func(r rune) bool {
		return unicode.IsSpace(r)
	}

	var trimmed string
	for i := 0; i < b.N; i++ {
		trimmed = strings.TrimFunc(s, f)
	}

	_ = trimmed
}

func BenchmarkTrimSpace(b *testing.B) {
	s := "      abcdefghi      "
	// fmt.Println("len(s): ", len(s))

	var trimmed string
	for i := 0; i < b.N; i++ {
		trimmed = strings.TrimSpace(s)
	}

	_ = trimmed
}
Results:

goos: linux
goarch: amd64
pkg: github.com/dgraph-io/21million
BenchmarkTrim-8              	11225523	       104 ns/op	      32 B/op	       1 allocs/op
BenchmarkTrimfunc-8          	21282284	        59.1 ns/op	       0 B/op	       0 allocs/op
BenchmarkTrimfuncUnicode-8   	12513160	        96.9 ns/op	       0 B/op	       0 allocs/op
BenchmarkTrimSpace-8         	63749401	        19.0 ns/op	       0 B/op	       0 allocs/op
PASS
ok  	github.com/dgraph-io/21million	5.142s
We can see strings.TrimSpace() outperforms all in normal case. It will perform same as BenchmarkTrimfuncUnicode in case of non-ascii chars in string.
However strings.TrimSpace() removes all whitespaces like \t, \n etc, which might change our existing behaviour. Hence we are not using it for now.

Allocations shows up while live loading data on large dataset(~1 billion RDFs).
Before Profile:

   25.79GB    70.96GB (flat, cum) 21.54% of Total
         .          .     80:	if len(line) == 0 {
         .          .     81:		return rnq, ErrEmpty
         .          .     82:	}
         .          .     83:
         .          .     84:	l.Reset(line)
         .    19.39GB     85:	l.Run(lexText)
         .          .     86:	if err := l.ValidateResult(); err != nil {
         .          .     87:		return rnq, err
         .          .     88:	}
         .          .     89:	it := l.NewIterator()
         .          .     90:	var oval string
         .          .     91:	var seenOval bool
         .          .     92:	var vend bool
         .          .     93:	isCommentLine := false
         .          .     94:	// We read items from the l.Items channel to which the lexer sends items.
         .          .     95:L:
         .          .     96:	for it.Next() {
         .          .     97:		item := it.Item()
         .          .     98:		switch item.Typ {
         .          .     99:		case itemSubject:
         .    12.90GB    100:			rnq.Subject = strings.Trim(item.Val, " ")
         .          .    101:
         .          .    102:		case itemSubjectFunc:
         .          .    103:			var err error
         .          .    104:			if rnq.Subject, err = parseFunction(it); err != nil {
         .          .    105:				return rnq, err
         .          .    106:			}
         .          .    107:
         .          .    108:		case itemObjectFunc:
         .          .    109:			var err error
         .          .    110:			if rnq.ObjectId, err = parseFunction(it); err != nil {
         .          .    111:				return rnq, err
         .          .    112:			}
         .          .    113:
         .          .    114:		case itemPredicate:
         .          .    115:			// Here we split predicate and lang directive (ex: "name@en"), if needed.
         .    12.88GB    116:			rnq.Predicate, rnq.Lang = x.PredicateLang(strings.Trim(item.Val, " "))
         .          .    117:
         .          .    118:		case itemObject:
         .          .    119:			rnq.ObjectId = strings.Trim(item.Val, " ")
         .          .    120:
         .          .    121:		case itemStar:
         .          .    122:			switch {
         .          .    123:			case rnq.Subject == "":
         .          .    124:				rnq.Subject = x.Star
         .          .    125:			case rnq.Predicate == "":
         .          .    126:				rnq.Predicate = x.Star
         .          .    127:			default:
         .          .    128:				rnq.ObjectValue = &api.Value{Val: &api.Value_DefaultVal{DefaultVal: x.Star}}
         .          .    129:			}
         .          .    130:
         .          .    131:		case itemLiteral:
         .          .    132:			var err error
         .          .    133:			oval, err = strconv.Unquote(item.Val)
         .          .    134:			if err != nil {
         .          .    135:				return rnq, errors.Wrapf(err, "while unquoting")
         .          .    136:			}
         .          .    137:			seenOval = true
         .          .    138:
         .          .    139:		case itemLanguage:
         .          .    140:			rnq.Lang = item.Val
         .          .    141:
         .          .    142:		case itemObjectType:
         .          .    143:			if rnq.Predicate == x.Star || rnq.Subject == x.Star {
         .          .    144:				return rnq, errors.Errorf("If predicate/subject is *, value should be * as well")
         .          .    145:			}
         .          .    146:
         .          .    147:			val := strings.Trim(item.Val, " ")
         .          .    148:			// TODO: Check if this condition is required.
         .          .    149:			if strings.Trim(val, " ") == "*" {
         .          .    150:				return rnq, errors.Errorf("itemObject can't be *")
         .          .    151:			}
         .          .    152:			// Lets find out the storage type from the type map.
         .          .    153:			t, ok := typeMap[val]
         .          .    154:			if !ok {
         .          .    155:				return rnq, errors.Errorf("Unrecognized rdf type %s", val)
         .          .    156:			}
         .          .    157:			if oval == "" && t != types.StringID {
         .          .    158:				return rnq, errors.Errorf("Invalid ObjectValue")
         .          .    159:			}
         .          .    160:			src := types.ValueForType(types.StringID)
         .          .    161:			src.Value = []byte(oval)
         .          .    162:			// if this is a password value dont re-encrypt. issue#2765
         .          .    163:			if t == types.PasswordID {
         .          .    164:				src.Tid = t
         .          .    165:			}
         .          .    166:			p, err := types.Convert(src, t)
         .          .    167:			if err != nil {
         .          .    168:				return rnq, err
         .          .    169:			}
         .          .    170:
         .          .    171:			if rnq.ObjectValue, err = types.ObjectValue(t, p.Value); err != nil {
         .          .    172:				return rnq, err
         .          .    173:			}
         .          .    174:		case itemComment:
         .          .    175:			isCommentLine = true
         .          .    176:			vend = true
         .          .    177:
         .          .    178:		case itemValidEnd:
         .          .    179:			vend = true
         .          .    180:			if !it.Next() {
         .          .    181:				return rnq, errors.Errorf("Invalid end of input. Input: [%s]", line)
         .          .    182:			}
         .          .    183:			// RDF spec says N-Quads should be terminated with a newline. Since we break the input
         .          .    184:			// by newline already. We should get EOF or # after dot(.)
         .          .    185:			item = it.Item()
         .          .    186:			if !(item.Typ == lex.ItemEOF || item.Typ == itemComment) {
         .          .    187:				return rnq, errors.Errorf("Invalid end of input. Expected newline or # after ."+
         .          .    188:					" Input: [%s]", line)
         .          .    189:			}
         .          .    190:			break L
         .          .    191:
         .          .    192:		case itemLabel:
         .          .    193:			rnq.Label = strings.Trim(item.Val, " ")
         .          .    194:
         .          .    195:		case itemLeftRound:
         .          .    196:			it.Prev() // backup '('
         .          .    197:			if err := parseFacetsRDF(it, &rnq); err != nil {
         .          .    198:				return rnq, errors.Wrap(err, "could not parse facet")
         .          .    199:			}
         .          .    200:		}
         .          .    201:	}
         .          .    202:
         .          .    203:	if !vend {
         .          .    204:		return rnq, errors.Errorf("Invalid end of input. Input: [%s]", line)
         .          .    205:	}
         .          .    206:	if isCommentLine {
         .          .    207:		return rnq, ErrEmpty
         .          .    208:	}
         .          .    209:	// We only want to set default value if we have seen ObjectValue within "" and if we didn't
         .          .    210:	// already set it.
         .          .    211:	if seenOval && rnq.ObjectValue == nil {
   25.79GB    25.79GB    212:		rnq.ObjectValue = &api.Value{Val: &api.Value_DefaultVal{DefaultVal: oval}}
         .          .    213:	}
         .          .    214:	if len(rnq.Subject) == 0 || len(rnq.Predicate) == 0 {
         .          .    215:		return rnq, errors.Errorf("Empty required fields in NQuad. Input: [%s]", line)
         .          .    216:	}
         .          .    217:	if len(rnq.ObjectId) == 0 && rnq.ObjectValue == nil {
This PR:

      26GB    45.50GB (flat, cum) 14.15% of Total
         .          .     80:	if len(line) == 0 {
         .          .     81:		return rnq, ErrEmpty
         .          .     82:	}
         .          .     83:
         .          .     84:	l.Reset(line)
         .    19.49GB     85:	l.Run(lexText)
         .          .     86:	if err := l.ValidateResult(); err != nil {
         .          .     87:		return rnq, err
         .          .     88:	}
         .          .     89:	it := l.NewIterator()
         .          .     90:	var oval string
         .          .     91:	var seenOval bool
         .          .     92:	var vend bool
         .          .     93:	isCommentLine := false
         .          .     94:	// We read items from the l.Items channel to which the lexer sends items.
         .          .     95:L:
         .          .     96:	for it.Next() {
         .          .     97:		item := it.Item()
         .          .     98:		switch item.Typ {
         .          .     99:		case itemSubject:
         .          .    100:			rnq.Subject = strings.TrimSpace(item.Val)
         .          .    101:
         .          .    102:		case itemSubjectFunc:
         .          .    103:			var err error
         .          .    104:			if rnq.Subject, err = parseFunction(it); err != nil {
         .          .    105:				return rnq, err
         .          .    106:			}
         .          .    107:
         .          .    108:		case itemObjectFunc:
         .          .    109:			var err error
         .          .    110:			if rnq.ObjectId, err = parseFunction(it); err != nil {
         .          .    111:				return rnq, err
         .          .    112:			}
         .          .    113:
         .          .    114:		case itemPredicate:
         .          .    115:			// Here we split predicate and lang directive (ex: "name@en"), if needed.
         .          .    116:			rnq.Predicate, rnq.Lang = x.PredicateLang(strings.TrimSpace(item.Val))
         .          .    117:
         .          .    118:		case itemObject:
         .          .    119:			rnq.ObjectId = strings.TrimSpace(item.Val)
         .          .    120:
         .          .    121:		case itemStar:
         .          .    122:			switch {
         .          .    123:			case rnq.Subject == "":
         .          .    124:				rnq.Subject = x.Star
         .          .    125:			case rnq.Predicate == "":
         .          .    126:				rnq.Predicate = x.Star
         .          .    127:			default:
         .          .    128:				rnq.ObjectValue = &api.Value{Val: &api.Value_DefaultVal{DefaultVal: x.Star}}
         .          .    129:			}
         .          .    130:
         .          .    131:		case itemLiteral:
         .          .    132:			var err error
         .          .    133:			oval, err = strconv.Unquote(item.Val)
         .          .    134:			if err != nil {
         .          .    135:				return rnq, errors.Wrapf(err, "while unquoting")
         .          .    136:			}
         .          .    137:			seenOval = true
         .          .    138:
         .          .    139:		case itemLanguage:
         .          .    140:			rnq.Lang = item.Val
         .          .    141:
         .          .    142:		case itemObjectType:
         .          .    143:			if rnq.Predicate == x.Star || rnq.Subject == x.Star {
         .          .    144:				return rnq, errors.Errorf("If predicate/subject is *, value should be * as well")
         .          .    145:			}
         .          .    146:
         .          .    147:			val := strings.TrimSpace(item.Val)
         .          .    148:			// TODO: Check if this condition is required.
         .          .    149:			if val == "*" {
         .          .    150:				return rnq, errors.Errorf("itemObject can't be *")
         .          .    151:			}
         .          .    152:			// Lets find out the storage type from the type map.
         .          .    153:			t, ok := typeMap[val]
         .          .    154:			if !ok {
         .          .    155:				return rnq, errors.Errorf("Unrecognized rdf type %s", val)
         .          .    156:			}
         .          .    157:			if oval == "" && t != types.StringID {
         .          .    158:				return rnq, errors.Errorf("Invalid ObjectValue")
         .          .    159:			}
         .          .    160:			src := types.ValueForType(types.StringID)
         .          .    161:			src.Value = []byte(oval)
         .          .    162:			// if this is a password value dont re-encrypt. issue#2765
         .          .    163:			if t == types.PasswordID {
         .          .    164:				src.Tid = t
         .          .    165:			}
         .          .    166:			p, err := types.Convert(src, t)
         .          .    167:			if err != nil {
         .          .    168:				return rnq, err
         .          .    169:			}
         .          .    170:
         .          .    171:			if rnq.ObjectValue, err = types.ObjectValue(t, p.Value); err != nil {
         .          .    172:				return rnq, err
         .          .    173:			}
         .          .    174:		case itemComment:
         .          .    175:			isCommentLine = true
         .          .    176:			vend = true
         .          .    177:
         .          .    178:		case itemValidEnd:
         .          .    179:			vend = true
         .          .    180:			if !it.Next() {
         .          .    181:				return rnq, errors.Errorf("Invalid end of input. Input: [%s]", line)
         .          .    182:			}
         .          .    183:			// RDF spec says N-Quads should be terminated with a newline. Since we break the input
         .          .    184:			// by newline already. We should get EOF or # after dot(.)
         .          .    185:			item = it.Item()
         .          .    186:			if !(item.Typ == lex.ItemEOF || item.Typ == itemComment) {
         .          .    187:				return rnq, errors.Errorf("Invalid end of input. Expected newline or # after ."+
         .          .    188:					" Input: [%s]", line)
         .          .    189:			}
         .          .    190:			break L
         .          .    191:
         .          .    192:		case itemLabel:
         .          .    193:			rnq.Label = strings.TrimSpace(item.Val)
         .          .    194:
         .          .    195:		case itemLeftRound:
         .          .    196:			it.Prev() // backup '('
         .          .    197:			if err := parseFacetsRDF(it, &rnq); err != nil {
         .          .    198:				return rnq, errors.Wrap(err, "could not parse facet")
         .          .    199:			}
         .          .    200:		}
         .          .    201:	}
         .          .    202:
         .          .    203:	if !vend {
         .          .    204:		return rnq, errors.Errorf("Invalid end of input. Input: [%s]", line)
         .          .    205:	}
         .          .    206:	if isCommentLine {
         .          .    207:		return rnq, ErrEmpty
         .          .    208:	}
         .          .    209:	// We only want to set default value if we have seen ObjectValue within "" and if we didn't
         .          .    210:	// already set it.
         .          .    211:	if seenOval && rnq.ObjectValue == nil {
      26GB       26GB    212:		rnq.ObjectValue = &api.Value{Val: &api.Value_DefaultVal{DefaultVal: oval}}
         .          .    213:	}
         .          .    214:	if len(rnq.Subject) == 0 || len(rnq.Predicate) == 0 {
         .          .    215:		return rnq, errors.Errorf("Empty required fields in NQuad. Input: [%s]", line)
         .          .    216:	}
         .          .    217:	if len(rnq.ObjectId) == 0 && rnq.ObjectValue == nil {
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Development

Successfully merging this pull request may close these issues.

2 participants