Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[0.9.4.2] After a few days running all HTTP requests start to throw 500 errors #4884

Closed
prawnsalad opened this issue Nov 23, 2015 · 11 comments
Closed
Labels

Comments

@prawnsalad
Copy link

Version: 0.9.4.2
Error log: (many more similar)

goroutine 2798674 [runnable]:
github.com/boltdb/bolt.(*Cursor).nsearch(0xc8f97df160, 0xc8fb4d0140, 0x45, 0x50)
        /tmp/build/src/github.com/boltdb/bolt/cursor.go:302
github.com/boltdb/bolt.(*Cursor).search(0xc8f97df160, 0xc8fb4d0140, 0x45, 0x50, 0x2551)
        /tmp/build/src/github.com/boltdb/bolt/cursor.go:247 +0x354
github.com/boltdb/bolt.(*Cursor).searchPage(0xc8f97df160, 0xc8fb4d0140, 0x45, 0x50, 0x7fefb2d3a000)
        /tmp/build/src/github.com/boltdb/bolt/cursor.go:298 +0x174
github.com/boltdb/bolt.(*Cursor).search(0xc8f97df160, 0xc8fb4d0140, 0x45, 0x50, 0xd3a)
        /tmp/build/src/github.com/boltdb/bolt/cursor.go:255 +0x3aa
github.com/boltdb/bolt.(*Cursor).searchPage(0xc8f97df160, 0xc8fb4d0140, 0x45, 0x50, 0x7fefb4b2c000)
        /tmp/build/src/github.com/boltdb/bolt/cursor.go:298 +0x174
github.com/boltdb/bolt.(*Cursor).search(0xc8f97df160, 0xc8fb4d0140, 0x45, 0x50, 0x2b2c)
        /tmp/build/src/github.com/boltdb/bolt/cursor.go:255 +0x3aa
github.com/boltdb/bolt.(*Cursor).searchPage(0xc8f97df160, 0xc8fb4d0140, 0x45, 0x50, 0x7fefb6367000)
        /tmp/build/src/github.com/boltdb/bolt/cursor.go:298 +0x174
github.com/boltdb/bolt.(*Cursor).search(0xc8f97df160, 0xc8fb4d0140, 0x45, 0x50, 0x4367)
        /tmp/build/src/github.com/boltdb/bolt/cursor.go:255 +0x3aa
github.com/boltdb/bolt.(*Cursor).seek(0xc8f97df160, 0xc8fb4d0140, 0x45, 0x50, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, ...)
        /tmp/build/src/github.com/boltdb/bolt/cursor.go:152 +0xda
github.com/boltdb/bolt.(*Bucket).Bucket(0xc8fb4a8ac0, 0xc8fb4d0140, 0x45, 0x50, 0x45)
        /tmp/build/src/github.com/boltdb/bolt/bucket.go:111 +0x180
github.com/influxdb/influxdb/tsdb/engine/bz1.(*Tx).Cursor(0xc8f9541f40, 0xc83b4202d0, 0x45, 0xc82bb31601, 0x0, 0x0)
        /tmp/build/src/github.com/influxdb/influxdb/tsdb/engine/bz1/bz1.go:629 +0x165
github.com/influxdb/influxdb/tsdb.(*SelectMapper).Open.func1(0xc8f96a87e0, 0x0, 0x0)
        /tmp/build/src/github.com/influxdb/influxdb/tsdb/mapper.go:244 +0xcfb
github.com/influxdb/influxdb/tsdb.(*SelectMapper).Open(0xc8f96a87e0, 0x0, 0x0)
        /tmp/build/src/github.com/influxdb/influxdb/tsdb/mapper.go:115 +0x157
github.com/influxdb/influxdb/tsdb.(*SelectExecutor).executeAggregate(0xc8f9682ba0, 0xc8f94a2f60)
        /tmp/build/src/github.com/influxdb/influxdb/tsdb/executor.go:404 +0x35c
created by github.com/influxdb/influxdb/tsdb.(*SelectExecutor).Execute
        /tmp/build/src/github.com/influxdb/influxdb/tsdb/executor.go:96 +0xf2

goroutine 2798670 [chan receive]:
github.com/influxdb/influxdb/services/httpd.(*Handler).serveQuery(0xc82008d300, 0x7feff98be810, 0xc8fb2a6dc0, 0xc89e482e00, 0x0)
        /tmp/build/src/github.com/influxdb/influxdb/services/httpd/handler.go:286 +0xc2c
github.com/influxdb/influxdb/services/httpd.(*Handler).(github.com/influxdb/influxdb/services/httpd.serveQuery)-fm(0x7feff98be810, 0xc8fb2a6dc0, 0xc89e482e00, 0x0)
        /tmp/build/src/github.com/influxdb/influxdb/services/httpd/handler.go:98 +0x48
github.com/influxdb/influxdb/services/httpd.authenticate.func1(0x7feff98be810, 0xc8fb2a6dc0, 0xc89e482e00)
        /tmp/build/src/github.com/influxdb/influxdb/services/httpd/handler.go:673 +0x70
net/http.HandlerFunc.ServeHTTP(0xc820150160, 0x7feff98be810, 0xc8fb2a6dc0, 0xc89e482e00)
        /home/philip/.gvm/gos/go1.5/src/net/http/server.go:1422 +0x3a
github.com/influxdb/influxdb/services/httpd.gzipFilter.func1(0x7feff98be738, 0xc8fb2a6d80, 0xc89e482e00)
        /tmp/build/src/github.com/influxdb/influxdb/services/httpd/handler.go:734 +0x2af
net/http.HandlerFunc.ServeHTTP(0xc820150180, 0x7feff98be738, 0xc8fb2a6d80, 0xc89e482e00)
        /home/philip/.gvm/gos/go1.5/src/net/http/server.go:1422 +0x3a
github.com/influxdb/influxdb/services/httpd.versionHeader.func1(0x7feff98be738, 0xc8fb2a6d80, 0xc89e482e00)
        /tmp/build/src/github.com/influxdb/influxdb/services/httpd/handler.go:743 +0xc0
net/http.HandlerFunc.ServeHTTP(0xc8201501a0, 0x7feff98be738, 0xc8fb2a6d80, 0xc89e482e00)
        /home/philip/.gvm/gos/go1.5/src/net/http/server.go:1422 +0x3a
github.com/influxdb/influxdb/services/httpd.cors.func1(0x7feff98be738, 0xc8fb2a6d80, 0xc89e482e00)
        /tmp/build/src/github.com/influxdb/influxdb/services/httpd/handler.go:776 +0x328
net/http.HandlerFunc.ServeHTTP(0xc8201501c0, 0x7feff98be738, 0xc8fb2a6d80, 0xc89e482e00)
        /home/philip/.gvm/gos/go1.5/src/net/http/server.go:1422 +0x3a
github.com/influxdb/influxdb/services/httpd.requestID.func1(0x7feff98be738, 0xc8fb2a6d80, 0xc89e482e00)
        /tmp/build/src/github.com/influxdb/influxdb/services/httpd/handler.go:786 +0x15d
net/http.HandlerFunc.ServeHTTP(0xc8201501e0, 0x7feff98be738, 0xc8fb2a6d80, 0xc89e482e00)
        /home/philip/.gvm/gos/go1.5/src/net/http/server.go:1422 +0x3a
github.com/influxdb/influxdb/services/httpd.logging.func1(0x7feff98be738, 0xc8fb2a6d60, 0xc89e482e00)
        /tmp/build/src/github.com/influxdb/influxdb/services/httpd/handler.go:794 +0x10b
net/http.HandlerFunc.ServeHTTP(0xc820150200, 0x7feff98be738, 0xc8fb2a6d60, 0xc89e482e00)
        /home/philip/.gvm/gos/go1.5/src/net/http/server.go:1422 +0x3a
github.com/influxdb/influxdb/services/httpd.recovery.func1(0x7feff98be700, 0xc8e7ae8e70, 0xc89e482e00)
        /tmp/build/src/github.com/influxdb/influxdb/services/httpd/handler.go:804 +0x114
net/http.HandlerFunc.ServeHTTP(0xc820150220, 0x7feff98be700, 0xc8e7ae8e70, 0xc89e482e00)
        /home/philip/.gvm/gos/go1.5/src/net/http/server.go:1422 +0x3a
github.com/bmizerany/pat.(*PatternServeMux).ServeHTTP(0xc820022108, 0x7feff98be700, 0xc8e7ae8e70, 0xc89e482e00)
        /tmp/build/src/github.com/bmizerany/pat/mux.go:109 +0x244
github.com/influxdb/influxdb/services/httpd.(*Handler).ServeHTTP(0xc82008d300, 0x7feff98be700, 0xc8e7ae8e70, 0xc89e482e00)
        /tmp/build/src/github.com/influxdb/influxdb/services/httpd/handler.go:172 +0x357
net/http.serverHandler.ServeHTTP(0xc820113ce0, 0x7feff98be700, 0xc8e7ae8e70, 0xc89e482e00)
        /home/philip/.gvm/gos/go1.5/src/net/http/server.go:1862 +0x19e
net/http.(*conn).serve(0xc8e7ae8dc0)
        /home/philip/.gvm/gos/go1.5/src/net/http/server.go:1361 +0xbee
created by net/http.(*Server).Serve
        /home/philip/.gvm/gos/go1.5/src/net/http/server.go:1910 +0x3f6

goroutine 2798671 [chan receive]:
github.com/influxdb/influxdb/tsdb.(*QueryExecutor).executeSelectStatement(0xc82000aff0, 0x0, 0xc885d72a80, 0xc8fb2aa120, 0x2710, 0x0, 0x0)
        /tmp/build/src/github.com/influxdb/influxdb/tsdb/query_executor.go:285 +0xeb
github.com/influxdb/influxdb/tsdb.(*QueryExecutor).ExecuteQuery.func1(0xc8fb2a6e40, 0xc89e482eee, 0xb, 0xc82000aff0, 0xc8fb2aa120, 0x2710)
        /tmp/build/src/github.com/influxdb/influxdb/tsdb/query_executor.go:159 +0x757
created by github.com/influxdb/influxdb/tsdb.(*QueryExecutor).ExecuteQuery
        /tmp/build/src/github.com/influxdb/influxdb/tsdb/query_executor.go:213 +0x9b
@rossmcdonald rossmcdonald changed the title After a few days running all HTTP requests start to throw 500 errors [0.9.4.2] After a few days running all HTTP requests start to throw 500 errors Nov 23, 2015
@rossmcdonald
Copy link
Contributor

@prawnsalad Thanks for opening. Does a service restart resolve the problem? Is there anything else of note going on in the database (or server in general) when the issue occurs?

@corylanou Could this be the deadlock issue you've been working on?

@prawnsalad
Copy link
Author

@rossmcdonald Yes, a service restart gets things working again. Nothing else stands out to mention.. the CPU and memory usage were low. I'm using UDP input sources for writes, under 100 points /second. Had a grafana dashboard open querying every 5s. Ubuntu 14.04.3 LTS.

@beckettsean
Copy link
Contributor

Duplicate of #4870?

@prawnsalad
Copy link
Author

@beckettsean Sounds like it could be - though that issue doesn't mention any errors similar to the ones I mention here.

@JulienChampseix
Copy link

@beckettsean we are having the same symptom on InfluxDB v0.9.4.2 (git: 0.9.4 da1b59e)

Soon we got this symptom errors 500, we need to restart influxdb for have be able to restore service (soon action 'reading WAL files to flush to index' are done which is slow currently)

@rossmcdonald
Copy link
Contributor

@prawnsalad Did you have to do anything to get the stack trace (like explicitly kill the process, etc), or did it just appear in the logs during runtime?

@prawnsalad
Copy link
Author

They had just appeared in the log file at runtime. There was many others similar to the 3 I pasted too that went on for several hours.

@JulienChampseix
Copy link

Find new logs following several 500 error creating panic during this nights (dunno if it's linked to the same issue but symptom are same, having huge of 500 errors before crash) :

[write] 2015/11/24 19:37:21 write failed for shard 169 on node 1: engine: write points: write throughput too high. backoff and retry
[http] 2015/11/24 19:37:21 10.19.1.4 - brain [24/Nov/2015:19:37:21 +0000] POST /write?db=brain.applications&p=%5BREDACTED%5D&u=brain HTTP/1.1 500 81 - - c797cae7-92e2-11e5-8abf-000000000000 6.665614ms
[http] 2015/11/24 19:37:21 10.19.1.2 - brain [24/Nov/2015:19:37:21 +0000] POST /write?db=brain.applications&p=%5BREDACTED%5D&u=brain HTTP/1.1 500 81 - - c797d15a-92e2-11e5-8ac4-000000000000 6.59425ms
[write] 2015/11/24 19:37:21 write failed for shard 169 on node 1: engine: write points: write throughput too high. backoff and retry
[write] 2015/11/24 19:37:21 write failed for shard 169 on node 1: engine: write points: write throughput too high. backoff and retry
[http] 2015/11/24 19:37:21 10.19.1.4 - brain [24/Nov/2015:19:37:21 +0000] POST /write?db=brain.applications&p=%5BREDACTED%5D&u=brain HTTP/1.1 500 81 - - c797d19b-92e2-11e5-8ac5-000000000000 6.880628ms
[write] 2015/11/24 19:37:21 write failed for shard 169 on node 1: engine: write points: write throughput too high. backoff and retry
[http] 2015/11/24 19:37:21 10.19.1.4 - brain [24/Nov/2015:19:37:21 +0000] POST /write?db=brain.applications&p=%5BREDACTED%5D&u=brain HTTP/1.1 500 81 - - c797d0c1-92e2-11e5-8ac3-000000000000 7.177007ms
[http] 2015/11/24 19:37:21 10.13.1.6 - brain [24/Nov/2015:19:37:21 +0000] POST /write?db=brain.applications&p=%5BREDACTED%5D&u=brain HTTP/1.1 500 81 - - c79863a3-92e2-11e5-8ace-000000000000 3.467702ms
[http] 2015/11/24 19:37:21 10.19.1.18 - brain [24/Nov/2015:19:37:21 +0000] POST /write?db=brain.applications&p=%5BREDACTED%5D&u=brain HTTP/1.1 500 81 - - c7987ff7-92e2-11e5-8ad4-000000000000 2.986277ms
[write] 2015/11/24 19:37:21 write failed for shard 169 on node 1: engine: write points: write throughput too high. backoff and retry
[http] 2015/11/24 19:37:21 10.19.1.4 - brain [24/Nov/2015:19:37:21 +0000] POST /write?db=brain.applications&p=%5BREDACTED%5D&u=brain HTTP/1.1 500 81 - - c798ee9a-92e2-11e5-8ad6-000000000000 236.703µs
[write] 2015/11/24 19:37:21 write failed for shard 169 on node 1: engine: write points: write throughput too high. backoff and retry
[http] 2015/11/24 19:37:21 10.19.1.17 - brain [24/Nov/2015:19:37:21 +0000] POST /write?db=brain.applications&p=%5BREDACTED%5D&u=brain HTTP/1.1 500 81 - - c798323f-92e2-11e5-8acd-000000000000 5.895723ms
[write] 2015/11/24 19:37:21 write failed for shard 169 on node 1: engine: write points: write throughput too high. backoff and retry
[http] 2015/11/24 19:37:21 10.19.1.5 - brain [24/Nov/2015:19:37:21 +0000] POST /write?db=brain.applications&p=%5BREDACTED%5D&u=brain HTTP/1.1 500 81 - - c799124a-92e2-11e5-8ad7-000000000000 259.654µs
runtime:objectstart Span weird: p=0xca00000000 k=0x6500000 s.start=0xc9ff716000 s.limit=[write] 2015/11/24 19:37:21 write failed for shard 169 on node 1: engine: write points: write throughput too high. backoff and retry
0xca00416000 s.state=2
fatal error: objectstart: bad pointer in unexpected span

goroutine 56 [running]:
runtime.throw(0xcb2580, 0x2b)
    /home/philip/.gvm/gos/go1.5/src/runtime/panic.go:527 +0x90 fp=0xc8201d1610 sp=0xc8201d15f8
runtime.heapBitsForObject(0xca00000000, 0x0, 0x0, 0xc800000000, 0x7f191eafe700)
    /home/philip/.gvm/gos/go1.5/src/runtime/mbitmap.go:217 +0x287 fp=0xc8201d1648 sp=0xc8201d1610
runtime.scanobject(0xc9480e1cc0, 0xc820026720)
    /home/philip/.gvm/gos/go1.5/src/runtime/mgcmark.go:863 +0x239 fp=0xc8201d1718 sp=0xc8201d1648
runtime.gcDrainUntilPreempt(0xc820026720, 0x7d0)
    /home/philip/.gvm/gos/go1.5/src/runtime/mgcmark.go:726 +0x152 fp=0xc8201d1750 sp=0xc8201d1718
runtime.gcBgMarkWorker(0xc820025500)
    /home/philip/.gvm/gos/go1.5/src/runtime/mgc.go:1328 +0x474 fp=0xc8201d17b8 sp=0xc8201d1750
runtime.goexit()
    /home/philip/.gvm/gos/go1.5/src/runtime/asm_amd64.s:1696 +0x1 fp=0xc8201d17c0 sp=0xc8201d17b8
created by runtime.gcBgMarkStartWorkers
    /home/philip/.gvm/gos/go1.5/src/runtime/mgc.go:1238 +0x93

cc @rossmcdonald @beckettsean @corylanou

@beckettsean
Copy link
Contributor

Believed fixed by #4913, which is available in the just-released version 0.9.5.1.

@JulienChampseix that's a different panic and doesn't look like the deadlock issue. Can you open a new issue with that stack trace?

@JulienChampseix
Copy link

thanks @beckettsean i did here #4655 (but before to go on 0.9.5.X, i would be sure it's fixed on the 0.9.5.1 also)

@beckettsean
Copy link
Contributor

@JulienChampseix we've reverted to building with Go 1.4 for the time being, so any Go 1.5 bugs like #4655 should not be relevant for 0.9.5.1.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
Projects
None yet
Development

No branches or pull requests

4 participants