dgraph/cmd/live/run.go

/*
 * Copyright 2017-2018 Dgraph Labs, Inc. and Contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package live

import (
	"bufio"
	"compress/gzip"
	"context"
	"encoding/json"
	"fmt"
	"io"
	"io/ioutil"
	"math"
	"math/rand"
	"net/http"
	_ "net/http/pprof" // http profiler
	"os"
	"sort"
	"strconv"
	"strings"
	"sync"
	"time"

	"google.golang.org/grpc/metadata"

	"github.com/dgraph-io/badger/v2"
	bopt "github.com/dgraph-io/badger/v2/options"
	"github.com/dgraph-io/dgo/v200"
	"github.com/dgraph-io/dgo/v200/protos/api"

	"github.com/dgraph-io/dgraph/chunker"
	"github.com/dgraph-io/dgraph/ee/enc"
	"github.com/dgraph-io/dgraph/types"
	"github.com/dgraph-io/dgraph/x"
	"github.com/dgraph-io/dgraph/xidmap"

	"github.com/golang/glog"
	"github.com/pkg/errors"
	"github.com/spf13/cobra"
)

// options holds the live loader's configuration. run() fills it from
// Live.Conf and stores it in the package-level opt variable, which the rest of
// this file reads.
type options struct {
	// dataFiles is the --files value: location of the *.rdf(.gz) or
	// *.json(.gz) file(s) to load.
	dataFiles      string
	// dataFormat is the --format value ("rdf" or "json"); when empty the
	// format is derived from the file name.
	dataFormat     string
	// schemaFile is the --schema value: location of the schema file.
	schemaFile     string
	// zero is the --zero value: Dgraph zero gRPC server address.
	zero           string
	// concurrent is the --conc value: number of concurrent requests.
	concurrent     int
	// batchSize is the --batch value: number of N-Quads per mutation.
	batchSize      int
	// clientDir is the --xidmap value: directory to store the xid to uid
	// mapping. When empty, setup() does not open a badger store.
	clientDir      string
	// authToken is the --auth_token value, attached as "auth-token" metadata
	// to the Alter call made for the schema file.
	authToken      string
	// useCompression is the --use_compression value.
	useCompression bool
	// newUids is the --new_uids value: ignore UIDs in the load files and
	// assign new ones.
	newUids        bool
	// verbose is the --verbose value.
	verbose        bool
	// httpAddr is the --http value: address the pprof HTTP server listens on.
	httpAddr       string
	// bufferSize is the --bufferSize value. processLoadFile buffers up to
	// bufferSize*batchSize N-Quads before sorting and sending them.
	bufferSize     int
	// ludicrousMode is the --ludicrous_mode value.
	ludicrousMode  bool
	// upsert is the --upsert value. A non-empty value switches the loader to
	// upsert mode and is used by addUpsert as the predicate that stores xids.
	upsert         string
	// key is obtained from enc.ReadKey(Live.Conf) and handed to the schema and
	// data file readers.
	key            x.SensitiveByteSlice
}

// predicate describes a single predicate of the schema. getSchema decodes
// values of this type from the JSON returned by a "schema {}" query; the json
// tags below name the keys that are read.
type predicate struct {
	Predicate  string   `json:"predicate,omitempty"`
	// Type is the textual type name; schema.init resolves it into ValueType.
	Type       string   `json:"type,omitempty"`
	Tokenizer  []string `json:"tokenizer,omitempty"`
	// Count is consulted by processLoadFile, which orders N-Quads of
	// predicates with Count set after all others.
	Count      bool     `json:"count,omitempty"`
	List       bool     `json:"list,omitempty"`
	Lang       bool     `json:"lang,omitempty"`
	Index      bool     `json:"index,omitempty"`
	Upsert     bool     `json:"upsert,omitempty"`
	Reverse    bool     `json:"reverse,omitempty"`
	NoConflict bool     `json:"no_conflict,omitempty"`
	// ValueType is set by schema.init from types.TypeForName(Type).
	ValueType  types.TypeID
}

// schema is the set of predicates known to the loader.
type schema struct {
	// Predicates is decoded from the "schema" key of the JSON query response.
	Predicates []*predicate `json:"schema,omitempty"`
	// preds indexes Predicates by predicate name; it is built by init().
	preds      map[string]*predicate
}

// request is one unit of work sent over loader.reqs: a mutation plus the
// extra data the loader attaches to it.
type request struct {
	*api.Mutation
	// conflicts is not populated in this part of the file.
	// NOTE(review): presumably filled via loader.conflictGenerator — confirm.
	conflicts []uint64
	// query is the upsert query built by addUpsert; it stays empty when the
	// loader is not running in upsert mode.
	query     string
}

// init (re)builds the name -> predicate lookup table from Predicates and
// resolves each predicate's textual Type into its ValueType.
func (s *schema) init() {
	s.preds = make(map[string]*predicate, len(s.Predicates))
	for _, pred := range s.Predicates {
		// The second result of TypeForName is deliberately discarded, so
		// ValueType is set to whatever TypeID the lookup hands back.
		typeID, _ := types.TypeForName(pred.Type)
		pred.ValueType = typeID
		s.preds[pred.Predicate] = pred
	}
}

var (
	// opt is the loader configuration; run() assigns it from Live.Conf.
	opt options
	// sch is the schema decoded by getSchema. processLoadFile reads sch.preds
	// when ordering buffered N-Quads.
	sch schema

	// Live is the sub-command invoked when running "dgraph live".
	Live x.SubCommand
)

// init wires up the "live" sub-command: it creates the cobra command whose Run
// handler calls run(), sets the environment variable prefix and registers
// every command-line flag, including the encryption and client TLS flag sets.
func init() {
	Live.Cmd = &cobra.Command{
		Use:   "live",
		Short: "Run Dgraph live loader",
		Run: func(cmd *cobra.Command, args []string) {
			defer x.StartProfile(Live.Conf).Stop()
			if err := run(); err != nil {
				x.Check2(fmt.Fprintf(os.Stderr, "%s", err.Error()))
				os.Exit(1)
			}
		},
	}
	Live.EnvPrefix = "DGRAPH_LIVE"

	flag := Live.Cmd.Flags()
	flag.StringP("files", "f", "", "Location of *.rdf(.gz) or *.json(.gz) file(s) to load")
	flag.StringP("schema", "s", "", "Location of schema file")
	flag.String("format", "", "Specify file format (rdf or json) instead of getting it "+
		"from filename")
	flag.StringP("alpha", "a", "127.0.0.1:9080",
		"Comma-separated list of Dgraph alpha gRPC server addresses")
	flag.StringP("zero", "z", "127.0.0.1:5080", "Dgraph zero gRPC server address")
	flag.IntP("conc", "c", 10,
		"Number of concurrent requests to make to Dgraph")
	flag.IntP("batch", "b", 1000,
		"Number of N-Quads to send as part of a mutation.")
	flag.StringP("xidmap", "x", "", "Directory to store xid to uid mapping")
	flag.StringP("auth_token", "t", "",
		"The auth token passed to the server for Alter operation of the schema file")
	flag.BoolP("use_compression", "C", false,
		"Enable compression on connection to alpha server")
	flag.Bool("new_uids", false,
		"Ignore UIDs in load files and assign new ones.")
	flag.String("http", "localhost:6060", "Address to serve http (pprof).")
	flag.Bool("verbose", false, "Run the live loader in verbose mode")
	flag.StringP("user", "u", "", "Username if login is required.")
	flag.StringP("password", "p", "", "Password of the user.")
	// Declared as an int flag because run() reads it with GetInt; a
	// non-numeric value is now rejected while parsing the command line instead
	// of reaching GetInt as an arbitrary string.
	flag.IntP("bufferSize", "m", 100, "Buffer for each thread")
	flag.Bool("ludicrous_mode", false, "Run live loader in ludicrous mode (Should "+
		"only be done when alpha is under ludicrous mode)")
	flag.StringP("upsert", "U", "", "run in upsert mode")

	// Encryption and Vault options
	enc.RegisterFlags(flag)
	// TLS configuration
	x.RegisterClientTLSFlags(flag)
}

// getSchema runs a "schema {}" query against alpha, decodes the JSON reply
// into the package-level sch, indexes it and returns a pointer to sch.
// A query or decoding failure is returned unchanged with a nil schema.
func getSchema(ctx context.Context, dgraphClient *dgo.Dgraph) (*schema, error) {
	readTxn := dgraphClient.NewTxn()
	defer readTxn.Discard(ctx)

	resp, err := readTxn.Query(ctx, "schema {}")
	if err != nil {
		return nil, err
	}

	if err := json.Unmarshal(resp.GetJson(), &sch); err != nil {
		return nil, err
	}

	// Build the predicate lookup table before handing the schema out.
	sch.init()
	return &sch, nil
}

// processSchemaFile reads the whole schema file and applies it to the cluster
// through an Alter operation, returning Alter's error.
//
// The file is opened through enc.GetReader with the given key and is
// additionally gunzipped when its name ends in ".gz" (case-insensitive). When
// opt.authToken is set it is sent as "auth-token" outgoing metadata. Failures
// while opening or reading the file are passed to the x.Check* helpers rather
// than returned.
func processSchemaFile(ctx context.Context, file string, key x.SensitiveByteSlice,
	dgraphClient *dgo.Dgraph) error {
	fmt.Printf("\nProcessing schema file %q\n", file)

	if opt.authToken != "" {
		authMD := metadata.New(nil)
		authMD.Append("auth-token", opt.authToken)
		ctx = metadata.NewOutgoingContext(ctx, authMD)
	}

	fh, err := os.Open(file)
	x.CheckfNoTrace(err)
	defer fh.Close()

	src, err := enc.GetReader(key, fh)
	x.Check(err)

	isGzipped := strings.HasSuffix(strings.ToLower(file), ".gz")
	if isGzipped {
		src, err = gzip.NewReader(src)
		x.Check(err)
	}

	contents, err := ioutil.ReadAll(src)
	x.Checkf(err, "Error while reading file")

	return dgraphClient.Alter(ctx, &api.Operation{Schema: string(contents)})
}

// uid maps an identifier found in a load file to the hexadecimal, "0x"-prefixed
// UID string that is sent to Dgraph.
//
// Unless --new_uids is set, a value that parses as an unsigned integer (base
// auto-detected, so the "0x..." form Dgraph itself outputs is accepted) is
// taken to be an existing node and is returned normalised. There is only
// limited protection against the user picking a UID that is still unassigned:
// it may later be handed out to another node, and avoiding that is up to the
// user. Every other value is treated as an external id and gets its UID from
// the loader's allocator.
func (l *loader) uid(val string) string {
	if !opt.newUids {
		if existing, err := strconv.ParseUint(val, 0, 64); err == nil {
			return "0x" + strconv.FormatUint(existing, 16)
		}
	}

	// The xid is passed through a fresh builder before reaching the allocator.
	// NOTE(review): presumably so the allocator keeps a string with its own
	// backing storage rather than a slice of the input chunk — confirm.
	var xid strings.Builder
	x.Check2(xid.WriteString(val))

	assigned, _ := l.alloc.AssignUid(xid.String())
	return "0x" + strconv.FormatUint(uint64(assigned), 16)
}

// allocateUids looks for the maximum uid value in the given NQuads and bumps
// the allocator's maximum seen uid to that value. It does nothing when
// --new_uids is set, because UIDs in the input are ignored in that mode.
//
// Subject and ObjectId are inspected independently: a subject that is not a
// numeric UID (for example a blank node or an external id) must not stop the
// object of the same N-Quad from being taken into account. Values that do not
// parse as an unsigned integer, including an empty ObjectId, are skipped.
func (l *loader) allocateUids(nqs []*api.NQuad) {
	if opt.newUids {
		return
	}

	var maxUid uint64
	for _, nq := range nqs {
		if sUid, err := strconv.ParseUint(nq.Subject, 0, 64); err == nil && sUid > maxUid {
			maxUid = sUid
		}
		if oUid, err := strconv.ParseUint(nq.ObjectId, 0, 64); err == nil && oUid > maxUid {
			maxUid = oUid
		}
	}
	l.alloc.BumpTo(maxUid)
}

// processFile forwards a file to the RDF or JSON processor as appropriate.
//
// The format comes from the file name or the --format option. When neither
// settles it, the content is probed: JSON data is loaded as JSON, anything
// else is rejected with a request for an explicit --format. A failure of the
// probe itself is returned to the caller; previously it was dropped and the
// still-unknown format was passed on to chunker.NewChunker.
func (l *loader) processFile(ctx context.Context, filename string, key x.SensitiveByteSlice) error {
	fmt.Printf("Processing data file %q\n", filename)

	rd, cleanup := chunker.FileReader(filename, key)
	defer cleanup()

	loadType := chunker.DataFormat(filename, opt.dataFormat)
	if loadType == chunker.UnknownFormat {
		isJSON, err := chunker.IsJSONData(rd)
		if err != nil {
			return errors.Wrapf(err, "while detecting the data format of %s", filename)
		}
		if !isJSON {
			return errors.Errorf("need --format=rdf or --format=json to load %s", filename)
		}
		loadType = chunker.JsonFormat
	}

	return l.processLoadFile(ctx, rd, chunker.NewChunker(loadType, opt.batchSize))
}

// addUpsert turns req into an upsert request keyed on the opt.upsert predicate.
//
// Every distinct subject / object id (xid) seen in the mutation is assigned
// one query variable u_N, and the N-Quads are rewritten in place to refer to
// uid(u_N). For each xid the query looks up the node whose opt.upsert
// predicate equals the xid, and one extra N-Quad storing the xid under that
// predicate is appended to the mutation. The added N-Quad is only useful if
// the uid doesn't exist yet.
//
// Example for opt.upsert = "xid" and a single subject "m.1234":
//
// query {
//     u_1 as var(func: eq(xid, "m.1234"))
// }
//
// mutation {
//     set {
//          uid(u_1) <predicate> value .    --> original request
//          uid(u_1) xid "m.1234" .         --> gets added here
//     }
// }
//
// NOTE(review): the xid is interpolated into the query verbatim, so an xid
// containing a double quote yields a malformed query — confirm whether inputs
// can contain one.
func (l *loader) addUpsert(req *request) {
	// ids maps each xid to its query variable. A repeated xid must reuse the
	// variable it was given first: the query defines exactly one variable per
	// xid, so handing out a new name per occurrence would leave the earlier
	// uid(u_N) references undefined.
	ids := make(map[string]string)
	varFor := func(xid string) string {
		if name, ok := ids[xid]; ok {
			return name
		}
		name := fmt.Sprintf("u_%d", len(ids)+1)
		ids[xid] = name
		return name
	}

	for _, nq := range req.Mutation.Set {
		nq.Subject = fmt.Sprintf("uid(%s)", varFor(nq.Subject))
		if len(nq.ObjectId) > 0 {
			nq.ObjectId = fmt.Sprintf("uid(%s)", varFor(nq.ObjectId))
		}
	}

	if len(ids) > 0 {
		// Move the N-Quads into a slice of their own with room for the xid
		// N-Quads added below. req.Mutation.Set is a window into the caller's
		// buffer, so appending to it directly could overwrite the N-Quads that
		// follow it there. The destination needs the full length (not 0) and
		// must be the first argument of copy, otherwise nothing is copied and
		// the original N-Quads are lost.
		set := make([]*api.NQuad, len(req.Mutation.Set), len(req.Mutation.Set)+len(ids))
		copy(set, req.Mutation.Set)
		req.Mutation.Set = set
	}

	var query strings.Builder
	query.WriteString("query {\n")
	for xid, name := range ids {
		fmt.Fprintf(&query, `%s as var(func: eq(%s, "%s"))`, name, opt.upsert, xid)
		query.WriteByte('\n')
		req.Mutation.Set = append(req.Mutation.Set, &api.NQuad{
			Subject:     fmt.Sprintf("uid(%s)", name),
			Predicate:   opt.upsert,
			ObjectValue: &api.Value{Val: &api.Value_StrVal{StrVal: xid}},
		})
	}
	query.WriteString("}")

	req.query = query.String()
}

// processLoadFile reads rd chunk by chunk, parses each chunk with ck and turns
// the resulting N-Quads into mutation requests on l.reqs.
//
// The calling goroutine does the reading and parsing; a second goroutine
// consumes the parsed N-Quads from ck.NQuads(), buffers and orders them, and
// sends them out in batches. The function returns nil once the input is
// exhausted and the second goroutine has finished. It returns ctx.Err() when
// ctx is done and a wrapped error when parsing fails; on those two paths the
// second goroutine is neither flushed nor waited for.
func (l *loader) processLoadFile(ctx context.Context, rd *bufio.Reader, ck chunker.Chunker) error {
	var wg sync.WaitGroup
	wg.Add(1)
	nqbuf := ck.NQuads()
	// Spin a goroutine to push NQuads to mutation channel.
	go func() {
		defer wg.Done()
		buffer := make([]*api.NQuad, 0, opt.bufferSize*opt.batchSize)

		// drain sorts everything currently in buffer and sends it to l.reqs in
		// requests of at most opt.batchSize N-Quads, leaving buffer empty.
		drain := func() {
			// We collect opt.bufferSize requests and preprocess them. For the requests
			// to not conflict among themselves, we sort them on the basis of their predicates.
			// Predicates with count index will conflict among themselves, so we keep them at
			// end, making room for other predicates to load quickly.
			sort.Slice(buffer, func(i, j int) bool {
				iPred := sch.preds[buffer[i].Predicate]
				jPred := sch.preds[buffer[j].Predicate]
				// t ranks a predicate: 1 when it is known to the schema and has a
				// count index, 0 otherwise (including predicates not in the schema).
				t := func(a *predicate) int {
					if a != nil && a.Count {
						return 1
					}
					return 0
				}

				// Sorts the nquads on basis of their predicates, while keeping the
				// predicates with count index later than those without it.
				if t(iPred) != t(jPred) {
					return t(iPred) < t(jPred)
				}
				return buffer[i].Predicate < buffer[j].Predicate
			})
			for len(buffer) > 0 {
				sz := opt.batchSize
				if len(buffer) < opt.batchSize {
					sz = len(buffer)
				}

				// The request's Set is a window into buffer, not a copy.
				mu := &request{Mutation: &api.Mutation{Set: buffer[:sz]}}
				if opt.upsert != "" {
					l.addUpsert(mu)
				}

				l.reqs <- mu
				buffer = buffer[sz:]
			}
		}

		// Runs until the N-Quad channel is closed.
		// NOTE(review): presumably nqbuf.Flush() below closes it — confirm.
		for nqs := range nqbuf.Ch() {
			if len(nqs) == 0 {
				continue
			}

			// Outside upsert mode the subject/object ids are resolved to UIDs
			// here; in upsert mode they are left as they are and rewritten by
			// addUpsert when the batch is drained.
			if opt.upsert == "" {
				l.allocateUids(nqs)
				for _, nq := range nqs {
					nq.Subject = l.uid(nq.Subject)
					if len(nq.ObjectId) > 0 {
						nq.ObjectId = l.uid(nq.ObjectId)
					}
				}
			}

			// Keep accumulating until bufferSize*batchSize N-Quads are available.
			buffer = append(buffer, nqs...)
			if len(buffer) < opt.bufferSize*opt.batchSize {
				continue
			}

			drain()
		}
		// Send whatever is left once the input is exhausted.
		drain()
	}()

	for {
		// Stop reading as soon as the context is done.
		select {
		case <-ctx.Done():
			return ctx.Err()
		default:
		}

		chunkBuf, err := ck.Chunk(rd)
		// Parse the entries of the chunk; the parsed N-Quads reach the goroutine
		// above through nqbuf. Parse runs before err is inspected, so the chunk
		// returned together with io.EOF is parsed as well.
		if oerr := ck.Parse(chunkBuf); oerr != nil {
			return errors.Wrap(oerr, "During parsing chunk in processLoadFile")
		}
		if err == io.EOF {
			break
		} else {
			x.Check(err)
		}
	}
	nqbuf.Flush()
	wg.Wait()

	return nil
}

// setup builds the loader used by run(): it optionally opens the badger store
// backing the xid -> uid map (only when --xidmap names a directory), connects
// to zero, creates the uid allocator and starts opts.Pending goroutines
// running makeRequests. Failures to create the directory, open the store or
// connect to zero are handed to the x.Check* helpers.
func setup(opts batchMutationOptions, dc *dgo.Dgraph) *loader {
	var kv *badger.DB
	if opt.clientDir != "" {
		x.Check(os.MkdirAll(opt.clientDir, 0700))

		badgerOpts := badger.DefaultOptions(opt.clientDir).
			WithTableLoadingMode(bopt.MemoryMap).
			WithCompression(bopt.ZSTD).
			WithSyncWrites(false).
			WithLoadBloomsOnOpen(false).
			WithZSTDCompressionLevel(3)

		var openErr error
		kv, openErr = badger.Open(badgerOpts)
		x.Checkf(openErr, "Error while creating badger KV posting store")
	}

	// The zero connection is set up with compression turned off, since
	// compression with zero actually makes things worse.
	zeroConn, err := x.SetupConnection(opt.zero, nil, false)
	x.Checkf(err, "Unable to connect to zero, Is it running at %s?", opt.zero)

	uidAlloc := xidmap.New(zeroConn, kv)
	ld := &loader{
		opts:      opts,
		dc:        dc,
		start:     time.Now(),
		reqs:      make(chan *request, opts.Pending*2),
		conflicts: make(map[uint64]struct{}),
		alloc:     uidAlloc,
		db:        kv,
		zeroconn:  zeroConn,
	}

	// One makeRequests goroutine per allowed pending request; requestsWg is
	// sized up front to match.
	ld.requestsWg.Add(opts.Pending)
	for remaining := opts.Pending; remaining > 0; remaining-- {
		go ld.makeRequests()
	}

	rand.Seed(time.Now().Unix())
	return ld
}

// run executes the live loader and is called by the "live" cobra command.
//
// It reads the configuration into opt, starts the pprof HTTP server, connects
// to alpha and zero, applies the schema file (when one is given), fetches the
// resulting schema from alpha, and then loads every data file found under
// --files concurrently. After all files are processed it closes the request
// channel, waits for outstanding and retried requests, prints a summary and,
// when an xidmap store is open, flushes the allocator and closes the store.
//
// The first error encountered is returned. An interruption
// (context.Canceled) while processing the schema file is reported on stdout
// and returns nil.
func run() error {
	var err error
	x.PrintVersion()
	opt = options{
		dataFiles:      Live.Conf.GetString("files"),
		dataFormat:     Live.Conf.GetString("format"),
		schemaFile:     Live.Conf.GetString("schema"),
		zero:           Live.Conf.GetString("zero"),
		concurrent:     Live.Conf.GetInt("conc"),
		batchSize:      Live.Conf.GetInt("batch"),
		clientDir:      Live.Conf.GetString("xidmap"),
		authToken:      Live.Conf.GetString("auth_token"),
		useCompression: Live.Conf.GetBool("use_compression"),
		newUids:        Live.Conf.GetBool("new_uids"),
		verbose:        Live.Conf.GetBool("verbose"),
		httpAddr:       Live.Conf.GetString("http"),
		bufferSize:     Live.Conf.GetInt("bufferSize"),
		ludicrousMode:  Live.Conf.GetBool("ludicrous_mode"),
		upsert:         Live.Conf.GetString("upsert"),
	}
	if opt.key, err = enc.ReadKey(Live.Conf); err != nil {
		fmt.Printf("unable to read key %v", err)
		return err
	}
	// Serve the default mux (pprof handlers) in the background; a failure to
	// start it is logged and does not stop the load.
	go func() {
		if err := http.ListenAndServe(opt.httpAddr, nil); err != nil {
			glog.Errorf("Error while starting HTTP server: %+v", err)
		}
	}()
	ctx := context.Background()
	bmOpts := batchMutationOptions{
		Size:          opt.batchSize,
		Pending:       opt.concurrent,
		PrintCounters: true,
		Ctx:           ctx,
		MaxRetries:    math.MaxUint32,
		bufferSize:    opt.bufferSize,
	}

	dg, closeFunc := x.GetDgraphClient(Live.Conf, true)
	defer closeFunc()

	l := setup(bmOpts, dg)
	defer l.zeroconn.Close()

	// Conflict keys are derived differently depending on whether the loader
	// runs in upsert mode.
	if opt.upsert == "" {
		l.conflictGenerator = parseUid
	} else {
		l.conflictGenerator = fingerPrintUid
	}

	if len(opt.schemaFile) > 0 {
		err := processSchemaFile(ctx, opt.schemaFile, opt.key, dg)
		if err != nil {
			if err == context.Canceled {
				fmt.Printf("Interrupted while processing schema file %q\n", opt.schemaFile)
				return nil
			}
			fmt.Printf("Error while processing schema file %q: %s\n", opt.schemaFile, err)
			return err
		}
		fmt.Printf("Processed schema file %q\n\n", opt.schemaFile)
	}

	l.schema, err = getSchema(ctx, dg)
	if err != nil {
		fmt.Printf("Error while loading schema from alpha %s\n", err)
		return err
	}

	if opt.dataFiles == "" {
		return errors.New("RDF or JSON file(s) location must be specified")
	}

	filesList := x.FindDataFiles(opt.dataFiles, []string{".rdf", ".rdf.gz", ".json", ".json.gz"})
	totalFiles := len(filesList)
	if totalFiles == 0 {
		return errors.Errorf("No data files found in %s", opt.dataFiles)
	}
	fmt.Printf("Found %d data file(s) to process\n", totalFiles)

	//	x.Check(dgraphClient.NewSyncMarks(filesList))
	// errCh is buffered with one slot per file, so a file goroutine can always
	// deliver its result even after run has returned on an earlier error.
	errCh := make(chan error, totalFiles)
	for _, file := range filesList {
		file = strings.Trim(file, " \t")
		go func(file string) {
			// Wrap (not Wrapf): the file name is plain text and must not be
			// interpreted as a format string. Wrap returns nil for a nil error.
			errCh <- errors.Wrap(l.processFile(ctx, file, opt.key), file)
		}(file)
	}

	// PrintCounters should be called after schema has been updated.
	if bmOpts.PrintCounters {
		go l.printCounters()
	}

	for i := 0; i < totalFiles; i++ {
		if err := <-errCh; err != nil {
			fmt.Printf("Error while processing data file %s\n", err)
			return err
		}
	}

	close(l.reqs)
	// First we wait for requestsWg, when it is done we know all retry requests have been added
	// to retryRequestsWg. We can't have the same waitgroup as by the time we call Wait, we can't
	// be sure that all retry requests have been added to the waitgroup.
	l.requestsWg.Wait()
	l.retryRequestsWg.Wait()
	c := l.Counter()
	// Avoid dividing by a truncated zero for runs shorter than one second.
	var rate uint64
	if c.Elapsed.Seconds() < 1 {
		rate = c.Nquads
	} else {
		rate = c.Nquads / uint64(c.Elapsed.Seconds())
	}
	// Lets print an empty line, otherwise Interrupted or Number of Mutations overwrites the
	// previous printed line.
	fmt.Printf("%100s\r", "")
	fmt.Printf("Number of TXs run            : %d\n", c.TxnsDone)
	fmt.Printf("Number of N-Quads processed  : %d\n", c.Nquads)
	fmt.Printf("Time spent                   : %v\n", c.Elapsed)
	fmt.Printf("N-Quads processed per second : %d\n", rate)

	// Persist the xid -> uid map only when a backing store was opened.
	if l.db != nil {
		if err := l.alloc.Flush(); err != nil {
			return err
		}
		if err := l.db.Close(); err != nil {
			return err
		}
	}
	return nil
}