跳至主要內容

在K8S集群中配置Prometheus监控SQLServer数据库

大约 6 分钟约 1654 字

在K8S集群中配置Prometheus监控SQLServer数据库

1 安装测试的SQLServer数据库

# [root@salt-master-50 ~/sqlserver]# cat mssql-sts.yaml 
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: mssql
spec:
  serviceName: mssql-headless
  replicas: 1
  selector:
    matchLabels:
      app: mssql
  template:
    metadata:
      labels:
        app: mssql
    spec:
      containers:
        - name: sql-server
          image: harbor.leadchina.cn/mssql/server:2019-CU11-ubuntu-20.04
          ports:
            - containerPort: 1433
              name: mssql
              protocol: TCP
          env:
            - name: ACCEPT_EULA
              value: "Y"
            - name: SA_PASSWORD
              value: "Qwerty123456"
#            - name: SA_PASSWORD
#              valueFrom:
#                secretKeyRef:
#                  name: sql-server-secrets
#                  key: sa-password
          volumeMounts:
            - name: sql-server-data
              mountPath: /var/opt/mssql
  volumeClaimTemplates:
    - metadata:
        name: sql-server-data
      spec:
        storageClassName: managed-nfs-storage
        accessModes: ["ReadWriteOnce"]
        resources:
          requests:
            storage: 1Gi

# [root@salt-master-50 ~/sqlserver]# cat mssql-svc.yaml
apiVersion: v1
kind: Service
metadata:
  labels:
    app.kubernetes.io/name: mssql-svc
  name: mssql-svc
  namespace: default
spec:
  ports:
  - name: mssql
    port: 1443
    protocol: TCP
    targetPort: 1433
  selector:
    app: mssql
  sessionAffinity: None
  type: NodePort
  
# 创建SQL Server StatefulSet无状态服务  
#[root@salt-master-50 ~/sqlserver]# cat mssql-sts-headless-svc.yaml
apiVersion: v1
kind: Service
metadata:
  labels:
    name: sqlserver-headless
  name: sqlserver-headless
  namespace: default
spec:
  clusterIP: None
  clusterIPs:
  - None
  internalTrafficPolicy: Cluster
  ipFamilies:
  - IPv4
  ipFamilyPolicy: SingleStack
  ports:
  - name: mssql
    port: 1433
    protocol: TCP
    targetPort: 1433
  selector:
    app: mssql
  sessionAffinity: None
  type: ClusterIP

2 部署SQLServer exporter

[root@salt-master-50 ~/sqlserver]# cat mssql-exporter-deployment.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  namespace: default
  name: mssql-metrics-config
  labels:
    app: mssql-exporter
data:
  sql_standard.collector.yml: |
    # A collector defining standard metrics for Microsoft SQL Server.
    #
    # It is required that the SQL Server user has the following permissions:
    #
    #   GRANT VIEW ANY DEFINITION TO
    #   GRANT VIEW SERVER STATE TO
    #
    collector_name: mssql_standard

    # Similar to global.min_interval, but applies to the queries defined by this collector only.
    #min_interval: 0s

    metrics:
      - metric_name: mssql_local_time_seconds
        type: gauge
        help: 'Local time in seconds since epoch (Unix time).'
        values: [unix_time]
        query: |
          SELECT DATEDIFF(second, '19700101', GETUTCDATE()) AS unix_time
      - metric_name: mssql_connections
        type: gauge
        help: 'Number of active connections.'
        key_labels:
          - db
        values: [count]
        query: |
          SELECT DB_NAME(sp.dbid) AS db, COUNT(sp.spid) AS count
          FROM sys.sysprocesses sp
          GROUP BY DB_NAME(sp.dbid)
      #
      # Collected from sys.dm_os_performance_counters
      #
      - metric_name: mssql_deadlocks
        type: counter
        help: 'Number of lock requests that resulted in a deadlock.'
        values: [cntr_value]
        query: |
          SELECT cntr_value
          FROM sys.dm_os_performance_counters WITH (NOLOCK)
          WHERE counter_name = 'Number of Deadlocks/sec' AND instance_name = '_Total'
      - metric_name: mssql_user_errors
        type: counter
        help: 'Number of user errors.'
        values: [cntr_value]
        query: |
          SELECT cntr_value
          FROM sys.dm_os_performance_counters WITH (NOLOCK)
          WHERE counter_name = 'Errors/sec' AND instance_name = 'User Errors'
      - metric_name: mssql_kill_connection_errors
        type: counter
        help: 'Number of severe errors that caused SQL Server to kill the connection.'
        values: [cntr_value]
        query: |
          SELECT cntr_value
          FROM sys.dm_os_performance_counters WITH (NOLOCK)
          WHERE counter_name = 'Errors/sec' AND instance_name = 'Kill Connection Errors'
      - metric_name: mssql_page_life_expectancy_seconds
        type: gauge
        help: 'The minimum number of seconds a page will stay in the buffer pool on this node without references.'
        values: [cntr_value]
        query: |
          SELECT top(1) cntr_value
          FROM sys.dm_os_performance_counters WITH (NOLOCK)
          WHERE counter_name = 'Page life expectancy'
      - metric_name: mssql_batch_requests
        type: counter
        help: 'Number of command batches received.'
        values: [cntr_value]
        query: |
          SELECT cntr_value
          FROM sys.dm_os_performance_counters WITH (NOLOCK)
          WHERE counter_name = 'Batch Requests/sec'
      - metric_name: mssql_log_growths
        type: counter
        help: 'Number of times the transaction log has been expanded, per database.'
        key_labels:
          - db
        values: [cntr_value]
        query: |
          SELECT rtrim(instance_name) AS db, cntr_value
          FROM sys.dm_os_performance_counters WITH (NOLOCK)
          WHERE counter_name = 'Log Growths' AND instance_name <> '_Total'
      - metric_name: mssql_buffer_cache_hit_ratio
        type: gauge
        help: 'Ratio of requests that hit the buffer cache'
        values: [cntr_value]
        query: |
          SELECT cntr_value
          FROM sys.dm_os_performance_counters
          WHERE [counter_name] = 'Buffer cache hit ratio'
      - metric_name: mssql_checkpoint_pages_sec
        type: gauge
        help: 'Checkpoint Pages Per Second'
        values: [cntr_value]
        query: |
          SELECT cntr_value
          FROM sys.dm_os_performance_counters
          WHERE [counter_name] = 'Checkpoint pages/sec'
      #
      # Collected from sys.dm_io_virtual_file_stats
      #
      - metric_name: mssql_io_stall_seconds
        type: counter
        help: 'Stall time in seconds per database and I/O operation.'
        key_labels:
          - db
          - type
        value_label: operation
        values:
          - read
          - write
        query_ref: mssql_io_stall
      - metric_name: mssql_io_stall_total_seconds
        type: counter
        help: 'Total stall time in seconds per database.'
        key_labels:
          - db
          - type
        values:
          - io_stall
        query_ref: mssql_io_stall

      #
      # Collected from sys.dm_os_process_memory
      #
      - metric_name: mssql_resident_memory_bytes
        type: gauge
        help: 'SQL Server resident memory size (AKA working set).'
        values: [resident_memory_bytes]
        query_ref: mssql_process_memory

      - metric_name: mssql_virtual_memory_bytes
        type: gauge
        help: 'SQL Server committed virtual memory size.'
        values: [virtual_memory_bytes]
        query_ref: mssql_process_memory

      - metric_name: mssql_memory_utilization_percentage
        type: gauge
        help: 'The percentage of committed memory that is in the working set.'
        values: [memory_utilization_percentage]
        query_ref: mssql_process_memory

      - metric_name: mssql_page_fault_count
        type: counter
        help: 'The number of page faults that were incurred by the SQL Server process.'
        values: [page_fault_count]
        query_ref: mssql_process_memory

      #
      # Collected from sys.dm_os_sys_memory
      #
      - metric_name: mssql_os_memory
        type: gauge
        help: 'OS physical memory, used and available.'
        value_label: 'state'
        values: [used, available]
        query: |
          SELECT
            (total_physical_memory_kb - available_physical_memory_kb) * 1024 AS used,
            available_physical_memory_kb * 1024 AS available
          FROM sys.dm_os_sys_memory
      - metric_name: mssql_os_page_file
        type: gauge
        help: 'OS page file, used and available.'
        value_label: 'state'
        values: [used, available]
        query: |
          SELECT
            (total_page_file_kb - available_page_file_kb) * 1024 AS used,
            available_page_file_kb * 1024 AS available
          FROM sys.dm_os_sys_memory
      - metric_name: mssql_disk_space_used_database_bytes_total
        type: gauge
        help: 'Disk used by database.'
        key_labels:
          - db
          - type_desc
        values: [CurrentSizeB]
        query: | 
          SELECT DB_NAME(database_id) AS db, type_desc, sum(size) * 1024.0 AS CurrentSizeB
          FROM sys.master_files
          WHERE database_id > 0 AND type IN (0,1)
          GROUP BY type_desc, DB_NAME(database_id)
      - metric_name: mssql_disk_space_avaiable_bytes_total
        type: gauge
        help: 'Available space from mount point'
        key_labels:
          - volume_mount_point
        values: [available_bytes]
        query_ref: mssql_disk_space_avaiable_bytes_total
      - metric_name: mssql_disk_space_total_bytes_total
        type: gauge
        help: 'Used space from mount point'
        key_labels:
          - volume_mount_point
        values: [total_bytes]
        query_ref: mssql_disk_space_avaiable_bytes_total
      - metric_name: mssql_table_size_bytes_total
        type: gauge
        help: 'Disk used by table.'
        query_ref: mssql_table_size
        key_labels:
          - db
          - obj_name
        values: [total_space]
      - metric_name: mssql_table_free_size_bytes_total
        type: gauge
        help: 'Disk used by table.'
        query_ref: mssql_table_size
        key_labels:
          - db
          - obj_name
        values: [unused_space]
      - metric_name: mssql_table_used_size_bytes_total
        type: gauge
        help: 'Disk used by table.'
        query_ref: mssql_table_size
        key_labels:
          - db
          - obj_name
        values: [used_space]
      - metric_name: mssql_transactions_total
        type: counter
        help: 'Transactions for each db'
        query_ref: mssql_transactions_total
        key_labels:
          - db
        values: [transactions_total]
    queries:
      - query_name: mssql_disk_space_avaiable_bytes_total
        query: | 
          SELECT sum(available_bytes) available_bytes ,sum(total_bytes) total_bytes, ISNULL(volume_mount_point,'') volume_mount_point
          FROM sys.master_files AS f
          CROSS APPLY sys.dm_os_volume_stats(f.database_id, f.file_id)
          GROUP BY volume_mount_point;
      - query_name: mssql_transactions_total
        query: |
          SELECT instance_name as db, cntr_value as transactions_total 
          FROM sys.dm_os_performance_counters 
          WHERE counter_name = 'Transactions/sec'
      # Populates `mssql_table_size`
      - query_name: mssql_table_size
        query: |
          IF OBJECT_ID('tempdb.dbo.#space') IS NOT NULL
              DROP TABLE #space

          CREATE TABLE #space (
                                  [db_name] SYSNAME
              , obj_name SYSNAME
              , total_pages BIGINT
              , used_pages BIGINT
              , total_rows BIGINT
          )

          DECLARE @SQL NVARCHAR(MAX)

          SELECT @SQL = STUFF((
                                  SELECT '
              USE [' + d.name + ']
              INSERT INTO #space ([db_name], obj_name, total_pages, used_pages, total_rows)
              SELECT DB_NAME(), SCHEMA_NAME(o.[schema_id]) + ''.'' + o.name, t.total_pages, t.used_pages, t.total_rows
              FROM (
                  SELECT
                        i.[object_id]
                      , total_pages = SUM(a.total_pages)
                      , used_pages = SUM(a.used_pages)
                      , total_rows = SUM(CASE WHEN i.index_id IN (0, 1) AND a.[type] = 1 THEN p.[rows] END)
                  FROM sys.indexes i
                  JOIN sys.partitions p ON i.[object_id] = p.[object_id] AND i.index_id = p.index_id
                  JOIN sys.allocation_units a ON p.[partition_id] = a.container_id
                  WHERE i.is_disabled = 0
                      AND i.is_hypothetical = 0
                  GROUP BY i.[object_id]
              ) t
              JOIN sys.objects o ON t.[object_id] = o.[object_id]
              WHERE o.name NOT LIKE ''dt%''
                  AND o.is_ms_shipped = 0
                  AND o.type = ''U''
                  AND o.[object_id] > 255;'
                                  FROM sys.databases d
                                  WHERE d.[state] = 0
                                  FOR XML PATH(''), TYPE).value('.', 'NVARCHAR(MAX)'), 1, 2, '')

          EXEC sys.sp_executesql @SQL

          SELECT
              [db_name] as db
              , obj_name
              , total_space = CAST(total_pages * 1024.0 * 8. AS DECIMAL(18,2))
              , used_space = CAST(used_pages * 1024.0 * 8. AS DECIMAL(18,2))
              , unused_space = CAST((total_pages - used_pages) * 1024.0 * 8. AS DECIMAL(18,2))
          FROM #space
      # Populates `mssql_io_stall` and `mssql_io_stall_total`
      - query_name: mssql_io_stall
        query: |
          SELECT
              type_desc as type,
              cast(DB_Name(a.database_id) as varchar) AS [db],
              sum(io_stall_read_ms) / 1000.0 AS [read],
              sum(io_stall_write_ms) / 1000.0 AS [write],
              sum(io_stall) / 1000.0 AS io_stall
          FROM
              sys.dm_io_virtual_file_stats(null, null) a
                  INNER JOIN sys.master_files b ON a.database_id = b.database_id AND a.file_id = b.file_id
                  CROSS APPLY sys.dm_os_volume_stats(a.database_id, a.file_id)
          GROUP BY a.database_id, type_desc
      # Populates `mssql_resident_memory_bytes`, `mssql_virtual_memory_bytes`, `mssql_memory_utilization_percentage` and
      # `mssql_page_fault_count`.
      - query_name: mssql_process_memory
        query: |
          SELECT
            physical_memory_in_use_kb * 1024 AS resident_memory_bytes,
            virtual_address_space_committed_kb * 1024 AS virtual_memory_bytes,
            memory_utilization_percentage,
            page_fault_count
          FROM sys.dm_os_process_memory
---
apiVersion: v1
kind: Secret
metadata:
  name: mssql-exporter-config
  namespace: default
type: Opaque
stringData:
  sql_exporter.yml: | 
    # Global defaults.
    global:
      # Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first.
      scrape_timeout_offset: 500ms
      # Minimum interval between collector runs: by default (0s) collectors are executed on every scrape.
      min_interval: 0s
      # Maximum number of open connections to any one target. Metric queries will run concurrently on multiple connections,
      # as will concurrent scrapes.
      max_connections: 3
      # Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should
      # always be the same as max_connections.
      max_idle_connections: 3

    # The target to monitor and the collectors to execute on it.
    target:
      # Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL)
      # the schema gets dropped or replaced to match the driver expected DSN format.
      data_source_name: 'sqlserver://sa:Qwerty123456@sqlserver-headless.default.svc:1433'
      
      # Collectors (referenced by name) to execute on the target.
      collectors: [mssql_standard]

    # Collector files specifies a list of globs. One collector definition is read from each matching file.
    collector_files: 
      - "/mssql-metrics-config/*.collector.yml"
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: mssql-exporter
  namespace: default
spec:
  replicas: 1
  selector:
    matchLabels:
      app: mssql-exporter
  template:
    metadata:
      annotations:
        prometheus.io/scrape: "true"
        prometheus.io/port: "9399"
      labels:
        app: mssql-exporter
    spec:
      containers:
      - name: mssql-exporter
        image: harbor.leadchina.cn/mssql/sql_exporter:v2019
        command: ["/bin/sql_exporter"]
        args: ["-config.file","/mssql-exporter-config/sql_exporter.yml"]
        ports:
        - name: metrics
          containerPort: 9399
          protocol: TCP
        volumeMounts:
          - name: mssql-exporter-config
            mountPath: /mssql-exporter-config/sql_exporter.yml
            subPath: sql_exporter.yml
          - name: mssql-metrics-config
            mountPath: /mssql-metrics-config
      volumes:
        - configMap:
            defaultMode: 420
            name: mssql-metrics-config
          name: mssql-metrics-config
        - name: mssql-exporter-config
          secret:
            secretName: mssql-exporter-config

# 部署mssql-exporter服务暴露metrics
#[root@salt-master-50 ~/sqlserver]# cat mssql-exporter-svc.yaml
apiVersion: v1
kind: Service
metadata:
  labels:
    app: mssql-exporter
  name: mssql-exporter-svc
  namespace: default
spec:
  ports:
  - name: metrics
    port: 9399
    protocol: TCP
    targetPort: 9399
  selector:
    app: mssql-exporter
  sessionAffinity: None
  type: NodePort

3 创建ServiceMonitor对象

# [root@salt-master-50 ~/sqlserver]# cat mssql-exporter-servicemonitor.yaml
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  labels:
    prometheus: service
  name: mssql-exporter
  namespace: default
spec:
  endpoints:
  - honorLabels: true
    port: metrics
    scheme: http
  jobLabel: jobLabel
  namespaceSelector:
    matchNames:
    - default
  selector:
    matchLabels:
      app: mssql-exporter

4 prometheus指标浏览器中确认metrics是否正常获取

在浏览器中输入获取metrics的url测试访问metrics是否正常返回结果。

5 grafana中导入MSSQL仪表板或自己创建

参考

https://promcat.io/apps/mssql