Skip to content

Commit

Permalink
Bug 1949907: Gather summary of PodNetworkConnectivityChecks (#374)
Browse files Browse the repository at this point in the history
* Add permissions for PodNetworkConnectivityChecks

* Gather PodNetworkConnectivityChecks

* Fix PNCC name

* Gather PNCC info

* Vendor PNCC struct

* Turn PNCC info into nested map

* Fix GatherPNCC function description

* Add meaningful PNCC gathering test

* Improve PNCC docs in code

* Update gathered-data.md with PNCC docs

* Add PNCC JSON to sample archive

* Fix PNCC reason/message counter logic
Apparently Go has a nice idiomatic way of doing this task
which is usually quite ugly in most other languages.

* Add version information to PNCC gatherer

* Gather time of last failed PNCC instead of count

* Fix PNCC test comments

* Remove coreClient from PNCC gatherer

* Add detailed note to PNCC gatherer comment

* Update PNCC docs in gathered-data

* Update PNCC in sample archive

* Make PNCC gatherer failable
  • Loading branch information
natiiix authored Apr 24, 2021
1 parent 6433cde commit 0f5d26c
Show file tree
Hide file tree
Showing 14 changed files with 984 additions and 1 deletion.
16 changes: 16 additions & 0 deletions docs/gathered-data.md
Original file line number Diff line number Diff line change
Expand Up @@ -421,6 +421,22 @@ Response see https://docs.openshift.com/container-platform/4.6/rest_api/workload
* 4.7+


## PNCC

collects a summary of failed PodNetworkConnectivityChecks.
Time of the most recently failed check with each reason and message is recorded.
The checks are requested via a dynamic client and
then unmarshaled into the appropriate structure.

Resource API: podnetworkconnectivitychecks.controlplane.operator.openshift.io/v1alpha1
Docs for relevant types: https://pkg.go.dev/github.com/openshift/api/operatorcontrolplane/v1alpha1

* Location in archive: config/podnetworkconnectivitychecks.json
* Id in config: pod_network_connectivity_checks
* Since versions:
* 4.8+


## PodDisruptionBudgets

gathers the cluster's PodDisruptionBudgets.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
"TCPConnectError": {
"kubernetes-apiserver-endpoint-master-0: failed to establish a TCP connection to 10.0.89.232:6443: dial tcp 10.0.89.232:6443: connect: connection refused": "2021-04-22T13:33:02+02:00",
"kubernetes-apiserver-service-cluster: failed to establish a TCP connection to 172.30.254.69:443: dial tcp 172.30.254.69:443: connect: connection refused": "2021-04-22T13:33:02+02:00",
"openshift-apiserver-endpoint-master-0: failed to establish a TCP connection to 10.128.0.43:8443: dial tcp 10.128.0.43:8443: connect: connection refused": "2021-04-21T14:09:02+02:00",
"openshift-apiserver-service-cluster: failed to establish a TCP connection to 172.30.140.22:443: dial tcp 172.30.140.22:443: i/o timeout": "2021-04-21T14:09:02+02:00"
}
}
10 changes: 9 additions & 1 deletion manifests/03-clusterrole.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,15 @@ rules:
verbs:
- get
- list
- watch
- watch
- apiGroups:
- controlplane.operator.openshift.io
resources:
- podnetworkconnectivitychecks
verbs:
- get
- list
- watch

---
apiVersion: rbac.authorization.k8s.io/v1
Expand Down
1 change: 1 addition & 0 deletions pkg/gather/clusterconfig/0_gatherer.go
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ var gatherFunctions = map[string]gathering{
"sap_pods": failable(GatherSAPPods),
"sap_datahubs": failable(GatherSAPDatahubs),
"olm_operators": failable(GatherOLMOperators),
"pod_network_connectivity_checks": failable(GatherPNCC),
}

// New creates new Gatherer
Expand Down
3 changes: 3 additions & 0 deletions pkg/gather/clusterconfig/const.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@ var (
datahubGroupVersionResource = schema.GroupVersionResource{
Group: "installers.datahub.sap.com", Version: "v1alpha1", Resource: "datahubs",
}
pnccGroupVersionResource = schema.GroupVersionResource{
Group: "controlplane.operator.openshift.io", Version: "v1alpha1", Resource: "podnetworkconnectivitychecks",
}
)

func init() {
Expand Down
84 changes: 84 additions & 0 deletions pkg/gather/clusterconfig/pod_network_connectivity_checks.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
package clusterconfig

import (
"context"
"encoding/json"
"time"

controlplanev1 "github.com/openshift/api/operatorcontrolplane/v1alpha1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/dynamic"

"github.com/openshift/insights-operator/pkg/record"
)

// GatherPNCC collects a summary of failed PodNetworkConnectivityChecks.
// Time of the most recently failed check with each reason and message is recorded.
// The checks are requested via a dynamic client and
// then unmarshaled into the appropriate structure.
//
// Resource API: podnetworkconnectivitychecks.controlplane.operator.openshift.io/v1alpha1
// Docs for relevant types: https://pkg.go.dev/github.com/openshift/api/operatorcontrolplane/v1alpha1
//
// * Location in archive: config/podnetworkconnectivitychecks.json
// * Id in config: pod_network_connectivity_checks
// * Since versions:
// * 4.8+
func GatherPNCC(g *Gatherer, c chan<- gatherResult) {
	// Exactly one result is sent on the channel, regardless of which path is taken.
	var result gatherResult

	// The PNCC resources are read through a dynamic client built from the gatherer's kube config.
	if client, err := dynamic.NewForConfig(g.gatherKubeConfig); err != nil {
		// Without a client nothing can be gathered; the error is the whole result.
		result.errors = []error{err}
	} else {
		result.records, result.errors = gatherPNCC(g.ctx, client)
	}

	c <- result
}

// getUnsuccessfulChecks returns the log entries whose Success flag is false,
// preserving their original order. The returned slice is never nil, even when
// no entry qualifies.
func getUnsuccessfulChecks(entries []controlplanev1.LogEntry) []controlplanev1.LogEntry {
	failed := []controlplanev1.LogEntry{}
	for i := range entries {
		if entries[i].Success {
			continue
		}
		failed = append(failed, entries[i])
	}
	return failed
}

// gatherPNCC lists all PodNetworkConnectivityChecks through the dynamic client,
// converts the unstructured list into its typed form via a JSON round trip and
// summarizes the unsuccessful log entries found in the status failures and in
// the start/end logs of every outage.
//
// The single returned record maps reason -> message -> start time of the most
// recent unsuccessful entry with that reason and message. Any error from
// listing, marshaling or unmarshaling is returned on its own with no records.
func gatherPNCC(ctx context.Context, dynamicClient dynamic.Interface) ([]record.Record, []error) {
	unstructuredList, err := dynamicClient.Resource(pnccGroupVersionResource).List(ctx, metav1.ListOptions{})
	if err != nil {
		return nil, []error{err}
	}

	raw, err := unstructuredList.MarshalJSON()
	if err != nil {
		return nil, []error{err}
	}

	var checks controlplanev1.PodNetworkConnectivityCheckList
	if err = json.Unmarshal(raw, &checks); err != nil {
		return nil, []error{err}
	}

	// Collect every unsuccessful entry across all checks before summarizing.
	failed := []controlplanev1.LogEntry{}
	for i := range checks.Items {
		status := &checks.Items[i].Status
		failed = append(failed, getUnsuccessfulChecks(status.Failures)...)
		for j := range status.Outages {
			failed = append(failed, getUnsuccessfulChecks(status.Outages[j].StartLogs)...)
			failed = append(failed, getUnsuccessfulChecks(status.Outages[j].EndLogs)...)
		}
	}

	latest := map[string]map[string]time.Time{}
	for _, entry := range failed {
		byMessage, known := latest[entry.Reason]
		if !known {
			byMessage = map[string]time.Time{}
			latest[entry.Reason] = byMessage
		}
		// Keep the stored time unless this entry started strictly later.
		if previous, seen := byMessage[entry.Message]; seen && !entry.Start.After(previous) {
			continue
		}
		byMessage[entry.Message] = entry.Start.Time
	}

	summary := record.Record{
		Name: "config/podnetworkconnectivitychecks",
		Item: record.JSONMarshaller{Object: latest},
	}
	return []record.Record{summary}, nil
}
85 changes: 85 additions & 0 deletions pkg/gather/clusterconfig/pod_network_connectivity_checks_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
package clusterconfig

import (
"context"
"reflect"
"testing"
"time"

"github.com/openshift/insights-operator/pkg/record"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/runtime/serializer/yaml"
dynamicfake "k8s.io/client-go/dynamic/fake"
)

// Test_PNCC runs gatherPNCC against a fake dynamic client twice: once with no
// PodNetworkConnectivityCheck present (expecting an empty summary) and once
// after creating a check with a single failed log entry (expecting that
// entry's reason and message mapped to the zero time, since the fixture sets
// no start time).
func Test_PNCC(t *testing.T) {
	const recordName = "config/podnetworkconnectivitychecks"

	var pnccYAML = `apiVersion: controlplane.operator.openshift.io/v1alpha1
kind: PodNetworkConnectivityCheck
metadata:
    name: example-pncc
    namespace: example-namespace
status:
    failures:
      - success: false
        reason: TestReason
        message: TestMessage
`

	// The fake client needs a list kind registered for every resource that is listed.
	pnccClient := dynamicfake.NewSimpleDynamicClientWithCustomListKinds(runtime.NewScheme(), map[schema.GroupVersionResource]string{
		pnccGroupVersionResource: "PodNetworkConnectivityCheckList",
	})

	decUnstructured := yaml.NewDecodingSerializer(unstructured.UnstructuredJSONScheme)
	testPNCC := &unstructured.Unstructured{}

	_, _, err := decUnstructured.Decode([]byte(pnccYAML), nil, testPNCC)
	if err != nil {
		t.Fatalf("unable to decode PNCC YAML: %v", err)
	}

	// check validates one gatherPNCC result: no errors, exactly one record with
	// the expected name, and a JSONMarshaller item wrapping the expected summary.
	check := func(run string, records []record.Record, errs []error, want map[string]map[string]time.Time) {
		t.Helper()
		if len(errs) > 0 {
			t.Fatalf("unexpected errors in the %s run: %#v", run, errs)
		}
		if len(records) != 1 {
			t.Fatalf("unexpected number of records in the %s run: %d", run, len(records))
		}
		rec := records[0]
		if rec.Name != recordName {
			t.Fatalf("unexpected name of record in the %s run: %q", run, rec.Name)
		}
		item, ok := rec.Item.(record.JSONMarshaller)
		if !ok {
			t.Fatalf("unexpected type of record item in the %s run: %T", run, rec.Item)
		}
		if !reflect.DeepEqual(item.Object, want) {
			t.Fatalf("unexpected value of record item in the %s run: %#v", run, item.Object)
		}
	}

	ctx := context.Background()

	// Check before creating the PNCC.
	records, errs := gatherPNCC(ctx, pnccClient)
	check("first", records, errs, map[string]map[string]time.Time{})

	// Create the PNCC resource; a failure here would invalidate the second run.
	_, err = pnccClient.Resource(pnccGroupVersionResource).Namespace("example-namespace").Create(ctx, testPNCC, metav1.CreateOptions{})
	if err != nil {
		t.Fatalf("unable to create PNCC resource in the fake client: %v", err)
	}

	// Check after creating the PNCC.
	records, errs = gatherPNCC(ctx, pnccClient)
	check("second", records, errs, map[string]map[string]time.Time{"TestReason": {"TestMessage": time.Time{}}})
}
Loading

0 comments on commit 0f5d26c

Please sign in to comment.