kubernetes

kubernetes - nagios 설치

sysman 2021. 5. 24. 17:55

 

 

nagios-core-kubernetes-pod ip : 192.168.200.150

client ip : 192.168.200.121

 

kubernetes nagios 설치

# vi nagios.yml

---
# Deployment: runs a single Nagios Core pod from the ethnchao/nagios image.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: nagios-deployment
spec:
  replicas: 1
  selector:
    matchLabels:
      app: nagios-core
  template:
    metadata:
      labels:
        # Must match both spec.selector.matchLabels above and the Service selector below.
        app: nagios-core
    spec:
      containers:
        - name: nagios-container
          image: ethnchao/nagios
          ports:
          # containerPort documents the port the container itself listens on,
          # so it has to agree with the Service's targetPort (80 / 5666),
          # not with the externally exposed nodePort (30008 / 30010).
          - containerPort: 80
          # NOTE(review): 5666 mirrors the Service's nrpe targetPort; confirm
          # the image actually listens on it.
          - containerPort: 5666
---
# Service: exposes the pod on every node through fixed NodePorts.
#   <node-ip>:30008 -> pod port 80   (Nagios web UI)
#   <node-ip>:30010 -> pod port 5666 (NRPE)
apiVersion: v1
kind: Service
metadata:
  name: nagios-service
spec:
  type: NodePort
  selector:
    app: nagios-core
  ports:
    - port: 80
      targetPort: 80
      nodePort: 30008
      name: http
    - port: 5666
      targetPort: 5666
      nodePort: 30010
      name: nrpe

#kubectl apply -f nagios.yml

# kubectl exec -it nagios-xxx -- /bin/bash

 

client 설치

 

# yum install nrpe nagios-plugins-nrpe

 

플러그인 설치

# wget https://nagios-plugins.org/download/nagios-plugins-2.3.3.tar.gz

#tar -zxf nagios-plugins-2.3.3.tar.gz

#cd nagios-plugins-2.3.3/

#./configure

#make all

#make && make install

 

관련 plugins 설치 확인

[root@test1 ~]# ls /usr/local/nagios/libexec/
check_apt                    check_disk       check_ifoperstatus  check_mrtgtraf    check_nwstat         check_services      check_time          negate
check_asterisk.pl            check_disk_smb   check_ifstatus      check_nagios      check_open_files.pl  check_simap         check_udp           remove_perfdata ....

#systemctl start nrpe

#systemctl status nrpe

 

SELinux 임시 해제(permissive) 및 방화벽 포트(5666, 30010) 오픈

#setenforce 0

#firewall-cmd --zone=public --add-port=5666/tcp --permanent

#firewall-cmd --zone=public --add-port=30010/tcp --permanent
# firewall-cmd --list-all

# firewall-cmd --reload

 

[root@test1 ~]# cat /etc/xinetd.d/nrpe
# default: off
# description: NRPE (Nagios Remote Plugin Executor)
service nrpe
{
    disable         = no
    per_source      = 25
    socket_type     = stream
    port            = 30010
    wait            = no
    user            = nagios
    group           = nagios
    server          = /usr/local/nagios/bin/nrpe
    server_args     = -c /usr/local/nagios/etc/nrpe.cfg --inetd
    only_from       = 127.0.0.1 192.168.200.121 192.168.200.150
    log_on_success  =
}

 

#vi /etc/services

nrpe    30010/tcp  # Nagios services

 

[root@test1 ~]# systemctl status xinetd.service
* xinetd.service - Xinetd A Powerful Replacement For Inetd
   Loaded: loaded (/usr/lib/systemd/system/xinetd.service; enabled; vendor preset: enabled)
   Active: active (running) since Mon 2021-05-24 18:06:03 KST; 8min ago
     Docs: man:xinetd
           man:xinetd.conf
           man:xinetd.log
  Process: 1014621 ExecStart=/usr/sbin/xinetd -stayalive -pidfile /var/run/xinetd.pid (code=exited, status=0/SUCCES>
 Main PID: 1014622 (xinetd)
    Tasks: 1 (limit: 11812)
   Memory: 1.4M
   CGroup: /system.slice/xinetd.service
           `-1014622 /usr/sbin/xinetd -stayalive -pidfile /var/run/xinetd.pid

May 24 18:06:03 test1.example.com xinetd[1014622]: removing daytime
May 24 18:06:03 test1.example.com xinetd[1014622]: removing discard
May 24 18:06:03 test1.example.com xinetd[1014622]: removing discard
May 24 18:06:03 test1.example.com xinetd[1014622]: removing echo
May 24 18:06:03 test1.example.com xinetd[1014622]: removing echo
May 24 18:06:03 test1.example.com xinetd[1014622]: removing tcpmux
May 24 18:06:03 test1.example.com xinetd[1014622]: removing time
May 24 18:06:03 test1.example.com xinetd[1014622]: removing time
May 24 18:06:03 test1.example.com xinetd[1014622]: xinetd Version 2.3.15 started with loadavg labeled-networking op>
May 24 18:06:03 test1.example.com xinetd[1014622]: Started working: 1 available service
lines 1-23/23 (END)

 

 

 

xinetd 데몬과 nrpe 데몬이 같은 포트(30010)를 사용하려고 해서 bind 에러가 발생하면 nrpe 데몬을 중지해야 한다. (# systemctl stop nrpe)

 

 

서버(nagios pod)와 클라이언트 양쪽에서 모두 확인함.

[root@test1 ~]# /usr/local/nagios/libexec/check_nrpe -H 192.168.200.121 -p 30010 -c check_load
OK - load average: 0.01, 0.06, 0.06|load1=0.010;15.000;30.000;0; load5=0.060;10.000;25.000;0; load15=0.060;5.000;20.000;0;

root@nagios-deployment-6f5fbfd568-pmjl9:/usr/local/nagios/etc/objects# /usr/local/nagios/libexec/check_nrpe -p 30010  -H 192.168.200.121 --command=check_load
OK - load average: 0.28, 0.08, 0.02|load1=0.280;15.000;30.000;0; load5=0.080;10.000;25.000;0; load15=0.020;5.000;20.000;0;

 

# vi /usr/local/nagios/etc/nrpe.cfg

server_port=30010

# client, nagios-core-server ip 등록
allowed_hosts=127.0.0.1,192.168.200.121,192.168.200.150

 

주석제거

command[check_users]=/usr/local/nagios/libexec/check_users -w 5 -c 10
command[check_load]=/usr/local/nagios/libexec/check_load -w 15,10,5 -c 30,25,20
command[check_hda1]=/usr/local/nagios/libexec/check_disk -w 20% -c 10% -p /dev/hda1
command[check_zombie_procs]=/usr/local/nagios/libexec/check_procs -w 5 -c 10 -s Z
command[check_total_procs]=/usr/local/nagios/libexec/check_procs -w 150 -c 200

서버에서는 check_nrpe -c <명령어 이름> (예: -c check_load) 으로 위에 정의한 명령어를 호출해서 그 실행 결과(output)를 가져온다.

 

 

[root@test1 ~]# systemctl status nrpe
* nrpe.service - Nagios Remote Plugin Executor
   Loaded: loaded (/usr/lib/systemd/system/nrpe.service; disabled; vendor preset: disabled)
   Active: active (running) since Mon 2021-05-24 11:05:06 KST; 4h 26min ago
     Docs: http://www.nagios.org/documentation
  Process: 859219 ExecStopPost=/bin/rm -f /run/nrpe/nrpe.pid (code=exited, status=0/SUCCESS)
 Main PID: 859223 (nrpe)
    Tasks: 1 (limit: 11812)
   Memory: 1.0M
   CGroup: /system.slice/nrpe.service
           `-859223 /usr/sbin/nrpe -c /etc/nagios/nrpe.cfg -f

May 24 11:05:06 test1.example.com systemd[1]: nrpe.service: Succeeded.
May 24 11:05:06 test1.example.com systemd[1]: Stopped Nagios Remote Plugin Executor.
May 24 11:05:06 test1.example.com systemd[1]: Started Nagios Remote Plugin Executor.
May 24 11:05:06 test1.example.com nrpe[859223]: Starting up daemon
May 24 11:05:06 test1.example.com nrpe[859223]: Server listening on 0.0.0.0 port 30010.
May 24 11:05:06 test1.example.com nrpe[859223]: Server listening on :: port 30010.
May 24 11:05:06 test1.example.com nrpe[859223]: Listening for connections on port 30010
May 24 11:05:06 test1.example.com nrpe[859223]: Allowing connections from: 127.0.0.1,192.168.200.150

 

포트 오픈 확인

#netstat -nlp | grep 30010

 

포트 번호가 서비스 이름(nrpe)으로 표시되도록 /etc/services 변경

[root@test1 ~]# vi /etc/services 
nrpe            30010/tcp                # Nagios Remote Plugin Executor
nrpe    30010/tcp

 

[root@test1 ~]# netstat -at | grep nrpe
tcp        0      0 0.0.0.0:nrpe            0.0.0.0:*               LISTEN
tcp6       0      0 [::]:nrpe               [::]:*                  LISTEN

 

<nagios-core-server kubernetes pod에서 확인>

정상

root@nagios-...:/usr/local/nagios/etc/objects# /usr/local/nagios/libexec/check_nrpe -H 192.168.200.121 -p 30010
NRPE v4.0.3

 

아래 메시지가 나오면 에러 (nrpe.cfg의 allowed_hosts, 포트, 방화벽 설정을 확인)

connection refused by host, could not complete ssl handshake

 

 

##########################################################

######################################################

 

<nagios-core-server-kubernetes-pod 에서 설치>

plugin 확인

root@nagios-deployment-6f5fbfd568-pmjl9:/usr/local/nagios/etc/objects# ls /usr/local/nagios/libexec/
check_apt      check_disk_smb  check_http          check_jabber    check_mysql_query  check_ntp_peer  check_procs    check_spop   check_ups     utils.pm

check_breeze   check_dummy     check_icmp          check_load      check_nagios       check_ntp_time  check_real     check_ssh    check_uptime  utils.sh ...

 

 

 

root@nagios-deployment-6f5fbfd568-pmjl9:/usr/local/nagios/etc/objects# /usr/local/nagios/libexec/check_nrpe -H 192.168.200.121 -p 30010
NRPE v4.0.3

 

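local config 파일(localhost.cfg)을 복사해서 client config 파일(client.cfg) 작성
(참고: check_local_* 명령은 nagios 서버 자신을 검사하므로, 원격 client의 값을 보려면 check_nrpe를 거치는 명령으로 바꿔야 한다.)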

# cd /usr/local/nagios/etc/objects/

#cp localhost.cfg client.cfg

#vim client.cfg

# Define a host for the local machine
각 호스트네임, address 변경
define host{
        use                     linux-server            ; Name of host template to use
                                                        ; This host definition will inherit all variables that are defined
                                                        ; in (or inherited by) the linux-server host template definition.
        host_name               test1.example.com
        alias                   test1
        address                 192.168.200.121
        }

# Define an optional hostgroup for Linux machines

 

여기 주석처리
#define hostgroup{
#        hostgroup_name  linux-servers ; The name of the hostgroup
#        alias           Linux Servers ; Long name of the group
#        members         localhost     ; Comma separated list of hosts that belong to this group
 #       }

# Define a service to "ping" the local machine

define service{
        use                             generic-service         ; Name of service template to use
        host_name                       test1.example.com
        service_description             PING
        check_command                   check_ping!100.0,20%!500.0,60%
        _graphitepostfix              ping
        }


# Define a service to check the disk space of the root partition
# on the local machine.  Warning if < 20% free, critical if
# < 10% free space on partition.

define service{
        use                             generic-service         ; Name of service template to use
        host_name                       test1.example.com
        service_description             Root Partition
        check_command                   check_local_disk!20%!10%!/
        }



# Define a service to check the number of currently logged in
# users on the local machine.  Warning if > 20 users, critical
# if > 50 users.

define service{
        use                             generic-service         ; Name of service template to use
        host_name                       test1.example.com
        service_description             Current Users
        check_command                   check_local_users!20!50
        }


# Define a service to check the number of currently running procs
# on the local machine.  Warning if > 250 processes, critical if
# > 400 processes.

define service{
        use                             generic-service         ; Name of service template to use
        host_name                       test1.example.com
        service_description             Total Processes
        check_command                   check_local_procs!250!400!RSZDT
        }



# Define a service to check the load on the local machine.

define service{
        use                             generic-service         ; Name of service template to use
        host_name                       test1.example.com
        service_description             Current Load
        check_command                   check_local_load!5.0,4.0,3.0!10.0,6.0,4.0
        _graphitepostfix              loadaverage
        }



# Define a service to check the swap usage the local machine.
# Critical if less than 10% of swap is free, warning if less than 20% is free

define service{
        use                             generic-service         ; Name of service template to use
        host_name                       test1.example.com
        service_description             Swap Usage
        check_command                   check_local_swap!20!10
        }



# Define a service to check SSH on the local machine.
# Disable notifications for this service by default, as not all users may have SSH enabled.




# Define a service to check HTTP on the local machine.
# Disable notifications for this service by default, as not all users may have HTTP enabled.

define service{
        use                             generic-service         ; Name of service template to use
        host_name                       test1.example.com
        service_description             HTTP
        check_command                   check_http
        notifications_enabled           0
        }

추가

# vim /etc/hosts

192.168.200.121 test1.example.com

 

데몬 restart (아래 nagios.cfg의 cfg_file 등록을 먼저 한 뒤에 실행해야 client.cfg가 적용된다)

#service nagios --full-restart

 

vim /usr/local/nagios/etc/nagios.cfg

cfg_file=/usr/local/nagios/etc/objects/client.cfg

 

 

 

HTTP, Swap Usage 항목에 표시되는 에러는 대상 서버에서 http 80 포트를 사용하지 않고, swapoff 상태이기 때문이다.

 

 

스위치 IP도 등록해 보라고 해서 스위치도 모니터링 대상으로 추가해 봤다.

 

 

참고

https://m.blog.naver.com/jkt0620/100202666331

 

https://m.blog.naver.com/skyhomo/220068815847