Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bug 1949907: Gather summary of PodNetworkConnectivityChecks #374

Merged
merged 20 commits into from
Apr 24, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions docs/gathered-data.md
Original file line number Diff line number Diff line change
Expand Up @@ -421,6 +421,22 @@ Response see https://docs.openshift.com/container-platform/4.6/rest_api/workload
* 4.7+


## PNCC

collects a summary of failed PodNetworkConnectivityChecks.
Time of the most recently failed check with each reason and message is recorded.
The checks are requested via a dynamic client and
then unmarshaled into the appropriate structure.

Resource API: podnetworkconnectivitychecks.controlplane.operator.openshift.io/v1alpha1
Docs for relevant types: https://pkg.go.dev/github.com/openshift/api/operatorcontrolplane/v1alpha1

* Location in archive: config/podnetworkconnectivitychecks.json
* Id in config: pod_network_connectivity_checks
* Since versions:
* 4.8+


## PodDisruptionBudgets

gathers the cluster's PodDisruptionBudgets.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
"TCPConnectError": {
"kubernetes-apiserver-endpoint-master-0: failed to establish a TCP connection to 10.0.89.232:6443: dial tcp 10.0.89.232:6443: connect: connection refused": "2021-04-22T13:33:02+02:00",
"kubernetes-apiserver-service-cluster: failed to establish a TCP connection to 172.30.254.69:443: dial tcp 172.30.254.69:443: connect: connection refused": "2021-04-22T13:33:02+02:00",
"openshift-apiserver-endpoint-master-0: failed to establish a TCP connection to 10.128.0.43:8443: dial tcp 10.128.0.43:8443: connect: connection refused": "2021-04-21T14:09:02+02:00",
"openshift-apiserver-service-cluster: failed to establish a TCP connection to 172.30.140.22:443: dial tcp 172.30.140.22:443: i/o timeout": "2021-04-21T14:09:02+02:00"
}
}
10 changes: 9 additions & 1 deletion manifests/03-clusterrole.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,15 @@ rules:
verbs:
- get
- list
- watch
- watch
- apiGroups:
- controlplane.operator.openshift.io
resources:
- podnetworkconnectivitychecks
verbs:
- get
- list
- watch

---
apiVersion: rbac.authorization.k8s.io/v1
Expand Down
1 change: 1 addition & 0 deletions pkg/gather/clusterconfig/0_gatherer.go
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ var gatherFunctions = map[string]gathering{
"sap_pods": failable(GatherSAPPods),
"sap_datahubs": failable(GatherSAPDatahubs),
"olm_operators": failable(GatherOLMOperators),
"pod_network_connectivity_checks": failable(GatherPNCC),
}

// New creates new Gatherer
Expand Down
3 changes: 3 additions & 0 deletions pkg/gather/clusterconfig/const.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@ var (
datahubGroupVersionResource = schema.GroupVersionResource{
Group: "installers.datahub.sap.com", Version: "v1alpha1", Resource: "datahubs",
}
pnccGroupVersionResource = schema.GroupVersionResource{
Group: "controlplane.operator.openshift.io", Version: "v1alpha1", Resource: "podnetworkconnectivitychecks",
}
)

func init() {
Expand Down
84 changes: 84 additions & 0 deletions pkg/gather/clusterconfig/pod_network_connectivity_checks.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
package clusterconfig

import (
"context"
"encoding/json"
"time"

controlplanev1 "github.com/openshift/api/operatorcontrolplane/v1alpha1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/dynamic"

"github.com/openshift/insights-operator/pkg/record"
)

// GatherPNCC collects a summary of failed PodNetworkConnectivityChecks.
// Time of the most recently failed check with each reason and message is recorded.
// The checks are requested via a dynamic client and
// then unmarshaled into the appropriate structure.
//
// Resource API: podnetworkconnectivitychecks.controlplane.operator.openshift.io/v1alpha1
// Docs for relevant types: https://pkg.go.dev/github.com/openshift/api/operatorcontrolplane/v1alpha1
//
// * Location in archive: config/podnetworkconnectivitychecks.json
// * Id in config: pod_network_connectivity_checks
natiiix marked this conversation as resolved.
Show resolved Hide resolved
// * Since versions:
// * 4.8+
func GatherPNCC(g *Gatherer, c chan<- gatherResult) {
gatherDynamicClient, err := dynamic.NewForConfig(g.gatherKubeConfig)
if err != nil {
c <- gatherResult{errors: []error{err}}
return
}

records, errors := gatherPNCC(g.ctx, gatherDynamicClient)
c <- gatherResult{records: records, errors: errors}
}

// getUnsuccessfulChecks returns, in their original order, the log entries
// whose Success flag is false. The result is always a non-nil slice, even
// when no entry qualifies.
func getUnsuccessfulChecks(entries []controlplanev1.LogEntry) []controlplanev1.LogEntry {
	failed := make([]controlplanev1.LogEntry, 0)
	for i := range entries {
		if entries[i].Success {
			continue
		}
		failed = append(failed, entries[i])
	}
	return failed
}

// gatherPNCC lists all PodNetworkConnectivityChecks through the dynamic client
// and condenses their unsuccessful log entries into a single record.
//
// The unstructured list is serialized to JSON and decoded into the typed
// PodNetworkConnectivityCheckList. Unsuccessful entries are taken from each
// check's status failures and from the start and end logs of its outages.
// The returned record, named "config/podnetworkconnectivitychecks", holds a
// map of reason -> message -> start time of the latest entry seen with that
// reason and message.
//
// An error from listing, marshaling or unmarshaling is returned as the only
// element of the error slice, together with nil records.
func gatherPNCC(ctx context.Context, dynamicClient dynamic.Interface) ([]record.Record, []error) {
	unstructuredList, err := dynamicClient.Resource(pnccGroupVersionResource).List(ctx, metav1.ListOptions{})
	if err != nil {
		return nil, []error{err}
	}

	rawList, err := unstructuredList.MarshalJSON()
	if err != nil {
		return nil, []error{err}
	}

	var typedList controlplanev1.PodNetworkConnectivityCheckList
	err = json.Unmarshal(rawList, &typedList)
	if err != nil {
		return nil, []error{err}
	}

	// Gather every unsuccessful entry across all checks before summarizing.
	var failed []controlplanev1.LogEntry
	for i := range typedList.Items {
		status := &typedList.Items[i].Status
		failed = append(failed, getUnsuccessfulChecks(status.Failures)...)
		for j := range status.Outages {
			failed = append(failed, getUnsuccessfulChecks(status.Outages[j].StartLogs)...)
			failed = append(failed, getUnsuccessfulChecks(status.Outages[j].EndLogs)...)
		}
	}

	// reason -> message -> latest start time.
	latestByReason := map[string]map[string]time.Time{}
	for i := range failed {
		entry := &failed[i]

		byMessage, known := latestByReason[entry.Reason]
		if !known {
			byMessage = map[string]time.Time{}
			latestByReason[entry.Reason] = byMessage
		}

		// Keep the stored time unless this entry started strictly later.
		if latest, seen := byMessage[entry.Message]; seen && !entry.Start.After(latest) {
			continue
		}
		byMessage[entry.Message] = entry.Start.Time
	}

	summary := record.Record{
		Name: "config/podnetworkconnectivitychecks",
		Item: record.JSONMarshaller{Object: latestByReason},
	}
	return []record.Record{summary}, nil
}
85 changes: 85 additions & 0 deletions pkg/gather/clusterconfig/pod_network_connectivity_checks_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
package clusterconfig

import (
"context"
"reflect"
"testing"
"time"

"github.com/openshift/insights-operator/pkg/record"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/runtime/serializer/yaml"
dynamicfake "k8s.io/client-go/dynamic/fake"
)

// Test_PNCC verifies that gatherPNCC always yields exactly one record named
// "config/podnetworkconnectivitychecks": an empty summary while no
// PodNetworkConnectivityCheck exists in the fake client, and a
// reason -> message -> time entry once a check carrying a failure log entry
// has been created.
func Test_PNCC(t *testing.T) {
	const recordName = "config/podnetworkconnectivitychecks"

	// The failure entry has no start time, so the summary is expected to
	// contain the zero time.Time for it.
	pnccYAML := `apiVersion: controlplane.operator.openshift.io/v1alpha1
kind: PodNetworkConnectivityCheck
metadata:
  name: example-pncc
  namespace: example-namespace
status:
  failures:
    - success: false
      reason: TestReason
      message: TestMessage
`

	pnccClient := dynamicfake.NewSimpleDynamicClientWithCustomListKinds(runtime.NewScheme(), map[schema.GroupVersionResource]string{
		pnccGroupVersionResource: "PodNetworkConnectivityChecksList",
	})

	decUnstructured := yaml.NewDecodingSerializer(unstructured.UnstructuredJSONScheme)
	testPNCC := &unstructured.Unstructured{}
	if _, _, err := decUnstructured.Decode([]byte(pnccYAML), nil, testPNCC); err != nil {
		t.Fatalf("unable to decode PNCC YAML: %v", err)
	}

	// checkGather runs gatherPNCC against the fake client and asserts that it
	// returns no errors and a single record with the expected name, item type
	// and summary. The run label only distinguishes failure messages.
	checkGather := func(run string, want map[string]map[string]time.Time) {
		t.Helper()

		records, errs := gatherPNCC(context.Background(), pnccClient)
		if len(errs) > 0 {
			t.Fatalf("unexpected errors in the %s run: %#v", run, errs)
		}
		if len(records) != 1 {
			t.Fatalf("unexpected number of records in the %s run: %d", run, len(records))
		}
		rec := records[0]
		if rec.Name != recordName {
			t.Fatalf("unexpected name of record in the %s run: %q", run, rec.Name)
		}
		recItem, ok := rec.Item.(record.JSONMarshaller)
		if !ok {
			t.Fatalf("unexpected type of record item in the %s run: %T", run, rec.Item)
		}
		if !reflect.DeepEqual(recItem.Object, want) {
			t.Fatalf("unexpected value of record item in the %s run: %#v", run, recItem)
		}
	}

	// Check before creating the PNCC.
	checkGather("first", map[string]map[string]time.Time{})

	// Create the PNCC resource; a failure here would invalidate the second check.
	_, err := pnccClient.Resource(pnccGroupVersionResource).
		Namespace("example-namespace").
		Create(context.Background(), testPNCC, metav1.CreateOptions{})
	if err != nil {
		t.Fatalf("unable to create the PNCC resource: %v", err)
	}

	// Check after creating the PNCC.
	checkGather("second", map[string]map[string]time.Time{"TestReason": {"TestMessage": time.Time{}}})
}
Loading