/*
 *  $Id$
 *
 *  This file is part of the OpenLink Software Virtuoso Open-Source (VOS)
 *  project.
 *
 *  Copyright (C) 1998-2025 OpenLink Software
 *
 *  This project is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU General Public License as published by the
 *  Free Software Foundation; only version 2 of the License, dated June 1991.
 *
 *  This program is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 *  General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License along
 *  with this program; if not, write to the Free Software Foundation, Inc.,
 *  51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

#include "sqlnode.h"
#include "sqlfn.h"
#include "monitor.h"

#ifdef WIN32
#include <Psapi.h>
#include <fileapi.h>
#include <ctype.h>
#include <direct.h>
#else
#include <sys/statvfs.h>
#include <limits.h>
#endif

/*
 *  Divisor applied to the resident set size reported by the operating
 *  system; given the units named below, the quotient is in megabytes
 *  (see its use in mon_get_next).
 */
#if defined(__APPLE__) || defined(WIN32)
#define MEM_RSS_UNITS (1024*1024)		/* These operating systems report value in bytes */
#else
#define MEM_RSS_UNITS (1024)			/* Most modern operating systems report value in kilobytes */
#endif

/* Number of slots in the ring of samples; mon_update wraps its write index at this value */
#define DIMENSION_OF_STATISTICS 60
/* Ring of samples; mon_update fills one slot per successful mon_get_next call */
static monitor_t statistics[DIMENSION_OF_STATISTICS];
/* Time of the last warning logged for each check of mon_check, indexed by check number */
static long mon_log_time[PAGE_SZ]; /* 8k log checks */
/* Copied from enable_qp in mon_init; used as the thread limit in the load checks */
uint32 mon_max_threads;
/* Set to 1 at the end of mon_init; mon_update and mon_check do nothing before that */
int mon_is_inited;
/* When 0, mon_init and mon_update return immediately */
int32 mon_enable = 1;
/* Set to 100 * mon_max_threads in mon_init */
int mon_max_cpu_pct;
/* Values of the most recent sample, stored by mon_get_next */
double curr_cpu_pct = 0.0;
unsigned long curr_mem_rss = 0;
unsigned long curr_page_faults = 0;
int64 curr_vm_size = 0;

/*
 *  Counters and settings defined in other modules.  Most of them are copied
 *  into each sample by mon_get_next.
 */
extern timeout_t time_now;
extern long disk_reads;
extern long read_block_usec;
extern long write_block_usec;
extern int64 read_cum_time;
extern long tc_no_thread_kill_idle;
extern long tc_no_thread_kill_vdb;
extern long tc_no_thread_kill_running;
extern long tws_accept_queued;
extern long tc_read_wait;
extern long tc_write_wait;
extern long tc_cl_keep_alive_timeouts;
extern long tc_cl_deadlocks;
extern long lock_deadlocks;
extern long lock_2r1w_deadlocks;
extern long lock_waits;
extern long lock_wait_msec;
extern long tc_no_mem_for_longer_batch;
extern size_t mp_large_in_use;
extern long tc_part_hash_join;
extern long tc_slow_temp_insert;
extern long tc_slow_temp_lookup;
extern unsigned char byte_logcount[256];
extern int32 enable_qp;
extern int64 mp_mmap_clocks;
/* Slot of the statistics ring that the next mon_update call writes */
static unsigned int monitor_index = 0;
static unsigned int current_inx; /* the last sample in stats */

#ifdef WIN32
/*
 *  Wall clock, kernel and user CPU times of the previous sample, the
 *  processor count and the process handle.  Initialised in mon_init and
 *  used by mon_get_next to compute the CPU percentage.
 */
static ULARGE_INTEGER lastCPU, lastSysCPU, lastUserCPU;
static int numProcessors;
static HANDLE me;
#endif

/*
 *  Monitored file systems, filled by mon_init: the key is the file system
 *  id (the drive number on Windows), the value is an fs_monitor_t.
 */
dk_hash_t * mon_fs;
/* Defined elsewhere; mon_get_next stores its result in curr_vm_size */
int64 get_proc_vm_size ();

/*
 *  Occurrence counters used by mon_log_error_event and the lock taken
 *  around them; both are allocated in mon_init.
 */
dk_hash_t *error_events_ht;
rwlock_t *error_events_lock;

/*
 *  mon_get_mount_point
 *
 *  Walks from file_name towards the root, one path component at a time,
 *  for as long as the component and its parent are on the same device and
 *  are not the same inode.  Returns the last component visited as a box
 *  made with box_string, or NULL when
 *    - file_name is NULL or does not fit in PATH_MAX,
 *    - file_name is a character/block device, FIFO or socket,
 *    - one of the stat calls on the way up fails.
 *  Failures of the walk are reported with log_error when log is non-zero.
 *
 *  A name without any '/' is treated as living in the current directory.
 *  On Windows the function always returns NULL.
 */
caddr_t
mon_get_mount_point (caddr_t file_name, int log)
{
#ifndef WIN32
  struct stat file_stat;
  struct stat parent_stat;
  char dir_name[PATH_MAX], parent_name[PATH_MAX], *slash;
  caddr_t mount = NULL;

  if (!file_name)
    return NULL;

  /* a longer name cannot be copied into the work buffers (nor be stat'ed) */
  if (strlen (file_name) >= sizeof (dir_name))
    {
      if (log)
        log_error ("Can't get status for %s", file_name);
      return NULL;
    }

  if (-1 != stat (file_name, &file_stat))
    {
      switch (file_stat.st_mode & S_IFMT)
        {
          case S_IFCHR:  /* ignore devices e.g. Null device */
          case S_IFBLK:
          case S_IFIFO:
          case S_IFSOCK:
              return NULL;
          default:
              break;
        }
    }

  dir_name[0] = 0;
  strncat_ck (dir_name, file_name, strlen (file_name));
  do
    {
      /* parent_name = dir_name with the last path component cut off */
      parent_name[0] = 0;
      strncat_ck (parent_name, dir_name, strlen (dir_name));
      slash = strrchr (parent_name, '/');
      if (NULL == slash)
        {
          /* relative name with no directory part, the parent is the cwd */
          parent_name[0] = 0;
          strncat_ck (parent_name, ".", 1);
        }
      else
        {
          *slash = 0;
          if (0 == parent_name[0])
            {
              /* the only slash was the leading one, the parent is the root */
              parent_name[0] = 0;
              strncat_ck (parent_name, "/", 1);
            }
        }
      if (-1 == stat (dir_name, &file_stat))
        {
          if (log)
            log_error ("Can't get status for %s", dir_name);
          goto fail;
        }

      if (!(file_stat.st_mode & S_IFDIR || file_stat.st_mode & S_IFREG))
        {
          /* the mode tested above is the one of dir_name, so report that name */
          if (log)
            log_error ("Not a directory %s", dir_name);
          goto fail;
        }

      if (-1 == stat (parent_name, &parent_stat))
        {
          if (log)
            log_error ("Can't get status for %s", parent_name);
          goto fail;
        }
      dk_free_box (mount);
      mount = box_string (dir_name);
      dir_name[0] = 0;
      strncat_ck (dir_name, parent_name, strlen (parent_name));
    }
  /* file_stat still describes the component just stored in mount */
  while (strlen (dir_name) > 1
      && file_stat.st_dev == parent_stat.st_dev
      && file_stat.st_ino != parent_stat.st_ino);
  return mount;
fail:
  /* drop the candidate kept from an earlier iteration, if any */
  dk_free_box (mount);
#endif
  return NULL;
}

/*
 *  mon_get_size_units
 *
 *  Formats a byte count into buf (at most len bytes, NUL included) using
 *  binary multiples: the value is divided by 1024 until it is below 1024
 *  and the matching unit letter is appended.  Bytes, K and M are printed
 *  as whole numbers (fraction dropped), G and above with one decimal.
 *
 *  The unit table goes up to E so that every 64-bit value gets a unit
 *  letter and buf is always written.
 *
 *  Returns buf.  Nothing is written when buf is NULL or len is not positive.
 */
char *
mon_get_size_units (char * buf, int len, uint64 size)
{
  static const char units[] = "bKMGTPE";
  const int last_unit = (int) sizeof (units) - 2;	/* index of 'E', the NUL is not a unit */
  double curr_sz = (double) size;
  int inx = 0;

  if (!buf || len <= 0)
    return buf;

  while (inx < last_unit && curr_sz >= 1024.0)
    {
      curr_sz /= 1024.0;
      inx++;
    }

  if (inx < 3)
    snprintf (buf, len, "%llu%c", (unsigned long long) curr_sz, units[inx]);
  else
    snprintf (buf, len, "%.01f%c", curr_sz, units[inx]);
  return buf;
}

/*
 *  mon_get_disk_space
 *
 *  Returns a size in bytes for the file system (the drive on Windows) that
 *  holds file:
 *    flag 1      total size
 *    flag 2      space available
 *    otherwise   0
 *
 *  When file is NULL or the file system can't be queried the function
 *  returns INT64_MAX and, if err is not NULL, stores a new error object
 *  made with srv_make_new_error in *err.
 */
uint64
mon_get_disk_space (char * file, int flag, caddr_t * err)
{
  uint64 total, avail;
  int eno;
#ifdef WIN32
  struct _diskfree_t ds;
  char drive;
  uint64 bs;
#else
  struct statvfs vfs;
#endif

  if (NULL == file)
    {
      eno = EINVAL;
      goto failed;
    }

#ifdef WIN32
  /* drive number from the drive letter, 'A' gives 1 */
  drive = toupper ((unsigned char) file[0]) - '@';
  if (0 != _getdiskfree (drive, &ds))
    {
      eno = errno;
      goto failed;
    }
  bs = (uint64) ds.sectors_per_cluster * ds.bytes_per_sector;
  total = ds.total_clusters * bs;
  avail = ds.avail_clusters * bs;
#else
  if (0 != statvfs (file, &vfs))
    {
      eno = errno;
      goto failed;
    }
  /* widen first, the statvfs fields can be 32 bit on some platforms */
  total = (uint64) vfs.f_frsize * (uint64) vfs.f_blocks;
  avail = (uint64) vfs.f_frsize * (uint64) vfs.f_bavail;
#endif

  switch (flag)
    {
      case 1:
          return total;
      case 2:
          return avail;
      default:
          return 0;
    }

failed:
  if (err)
    *err = srv_make_new_error ("42000", "FA112", "Can't stat file '%.1000s', error (%d) : %s",
	file ? file : "(null)", eno, strerror (eno));
  return INT64_MAX;
}

/*
 *  mon_init
 *
 *  One time set up of the monitor, does nothing when mon_enable is 0.
 *    - allocates mon_fs and the error event table with its lock, unless
 *      they exist already
 *    - adds one fs_monitor_t to mon_fs for every distinct file system
 *      (drive on Windows) that holds a file listed in wi_inst.wi_files;
 *      files that can't be queried are skipped
 *    - derives mon_max_threads and mon_max_cpu_pct from enable_qp
 *    - on Windows records the starting CPU times used by mon_get_next
 *  Sets mon_is_inited when done.
 */
void
mon_init (void)
{
  id_hash_t * virt_sys_files = wi_inst.wi_files;
  fs_monitor_t * fs;
  caddr_t mount;

  if (!mon_enable)
    return;

  if (!mon_fs)
    mon_fs = hash_table_allocate (11);

  if (!error_events_ht)
    {
      error_events_lock = rwlock_allocate ();
      error_events_ht = hash_table_allocate (31);
    }

#ifdef WIN32
  DO_IDHASH (caddr_t, fpath, caddr_t, fname, virt_sys_files)
    {
      struct _diskfree_t ds;
      /* drive number from the drive letter, 'A' gives 1 */
      char drive = toupper ((unsigned char) fpath[0]) - '@';
      int64 bs;
      if (0 != _getdiskfree (drive, &ds))
        continue;
      if (NULL != gethash ((void *)(ptrlong)drive, mon_fs))
        continue;
      bs = ds.sectors_per_cluster * ds.bytes_per_sector;
      fs = dk_alloc (sizeof (fs_monitor_t));
      /* no mount point is recorded on Windows, so clear the fields not set below */
      memset (fs, 0, sizeof (fs_monitor_t));
      fs->fm_sid = drive;
      fs->fm_total = ds.total_clusters * bs;
      fs->fm_free = ds.avail_clusters * bs;
      fs->fm_free_pct = (double) fs->fm_free * 100 / (double) fs->fm_total;
      sethash ((void *)(ptrlong) fs->fm_sid, mon_fs, (void *)fs);
    }
  END_DO_IDHASH;
#else
  DO_IDHASH (caddr_t, fpath, caddr_t, fname, virt_sys_files)
    {
      struct statvfs vfs;
      if (statvfs (fpath, &vfs) != 0)
        continue;
      /* one entry per file system is enough */
      if (NULL != gethash ((void *)(ptrlong)vfs.f_fsid, mon_fs))
        continue;
      if (NULL == (mount = mon_get_mount_point (fpath, 1)))
        continue;
      fs = dk_alloc (sizeof (fs_monitor_t));
      memset (fs, 0, sizeof (fs_monitor_t));
      fs->fm_fs = mount;
      fs->fm_sid = vfs.f_fsid;
      /* widen first, the statvfs fields can be 32 bit on some platforms */
      fs->fm_total = (uint64) vfs.f_frsize * (uint64) vfs.f_blocks;
      fs->fm_free = (uint64) vfs.f_frsize * (uint64) vfs.f_bavail;
      fs->fm_free_pct = (double) vfs.f_bavail * 100.0 / (double)(vfs.f_blocks - vfs.f_bfree + vfs.f_bavail);
      sethash ((void *)(ptrlong) fs->fm_sid, mon_fs, (void *)fs);
    }
  END_DO_IDHASH;
#endif

  mon_max_threads = enable_qp;
  mon_max_cpu_pct = 100 * mon_max_threads;
#ifdef WIN32
  {
    SYSTEM_INFO sysInfo;
    FILETIME ftime, fsys, fuser;
    GetSystemInfo (&sysInfo);
    numProcessors = sysInfo.dwNumberOfProcessors;
    GetSystemTimeAsFileTime (&ftime);
    memcpy (&lastCPU, &ftime, sizeof (FILETIME));
    me = GetCurrentProcess ();
    GetProcessTimes (me, &ftime, &ftime, &fsys, &fuser);
    memcpy (&lastSysCPU, &fsys, sizeof (FILETIME));
    memcpy (&lastUserCPU, &fuser, sizeof (FILETIME));
  }
#endif
  mon_is_inited = 1;
}

/*
 *  mon_get_next
 *
 *  Fills the sample next from the current process and server counters,
 *  using prev (the sample taken before it) for the values that are
 *  differences.
 *
 *  n_threads, n_vdb_threads, n_lw_threads are thread counts supplied by the
 *  caller; the number of running threads is taken as
 *  n_threads - n_vdb_threads - n_lw_threads.
 *
 *  Besides next the call updates curr_cpu_pct, curr_mem_rss,
 *  curr_page_faults and curr_vm_size, and refreshes the size figures of
 *  every file system in mon_fs.
 *
 *  Returns 0 when the sample is complete, 1 when getrusage failed (next is
 *  then only partly filled and must not be used).
 */
int
mon_get_next (int n_threads, int n_vdb_threads, int n_lw_threads, const monitor_t* prev, monitor_t *next)
{
  int thr_run = n_threads - n_vdb_threads - n_lw_threads;
  time_msec_t now = get_msec_real_time ();
#ifdef HAVE_GETRUSAGE
  struct rusage ru;
  double cpu_interval;
#endif

  /* the sample times are needed on every platform, mon_read_pct below depends on them */
  next->mon_time_now = now;
  next->mon_time_elapsed = now - prev->mon_time_now;

#ifdef HAVE_GETRUSAGE
  if (getrusage (RUSAGE_SELF, &ru) != 0)
    return 1;
  /* user + system time, in msec */
  next->mon_cpu_time =
      (ru.ru_utime.tv_sec * 1000 + ru.ru_utime.tv_usec / 1000) + (ru.ru_stime.tv_sec * 1000 + ru.ru_stime.tv_usec / 1000);
  cpu_interval = (double) (next->mon_time_now - prev->mon_time_now);
  /* two samples within the same msec would divide by zero */
  if (cpu_interval > 0)
    next->mon_cpu_pct = (next->mon_cpu_time - prev->mon_cpu_time) / cpu_interval * 100;
  else
    next->mon_cpu_pct = 0;
  curr_cpu_pct = next->mon_cpu_pct;
  /*
   * Faults since the previous sample.  curr_page_faults still holds the
   * cumulative count of that sample here; prev->mon_pageflts is itself a
   * difference and can't be used as the base.
   */
  next->mon_pageflts = ru.ru_majflt - curr_page_faults;
  curr_page_faults = ru.ru_majflt;
  curr_mem_rss = ru.ru_maxrss / MEM_RSS_UNITS;
#elif defined (WIN32)
  {
    FILETIME ftime, fsys, fuser;
    ULARGE_INTEGER now_ft, sys, user;
    double percent, wall;
    HANDLE hProcess = GetCurrentProcess ();
    PROCESS_MEMORY_COUNTERS pmc;

    GetSystemTimeAsFileTime (&ftime);
    memcpy (&now_ft, &ftime, sizeof (FILETIME));
    GetProcessTimes (me, &ftime, &ftime, &fsys, &fuser);
    memcpy (&sys, &fsys, sizeof (FILETIME));
    memcpy (&user, &fuser, sizeof (FILETIME));
    /* CPU time used since the last sample over wall time, per processor */
    percent = (sys.QuadPart - lastSysCPU.QuadPart) + (user.QuadPart - lastUserCPU.QuadPart);
    wall = (double) (now_ft.QuadPart - lastCPU.QuadPart);
    if (wall > 0 && numProcessors > 0)
      {
	percent /= wall;
	percent /= numProcessors;
      }
    else
      percent = 0;
    lastCPU = now_ft;
    lastUserCPU = user;
    lastSysCPU = sys;
    curr_cpu_pct = next->mon_cpu_pct = percent * 100;

    if (GetProcessMemoryInfo (hProcess, &pmc, sizeof (pmc)))
      {
	curr_mem_rss = pmc.WorkingSetSize / MEM_RSS_UNITS;
	/* same as above: difference against the previous cumulative count */
	next->mon_pageflts = pmc.PageFaultCount - curr_page_faults;
	curr_page_faults = pmc.PageFaultCount;
      }
  }
#endif

  /* refresh the size figures of the monitored file systems */
#ifdef WIN32
  DO_HT (ptrlong, drive, fs_monitor_t *, fs, mon_fs)
    {
      struct _diskfree_t ds;
      int64 bs;

      if (0 != _getdiskfree (drive, &ds))
        continue;
      bs = ds.sectors_per_cluster * ds.bytes_per_sector;
      fs->fm_total = ds.total_clusters * bs;
      fs->fm_free = ds.avail_clusters * bs;
      fs->fm_free_pct = (double) fs->fm_free * 100 / (double) fs->fm_total;
    }
  END_DO_HT;
#else
  DO_HT (ptrlong, id, fs_monitor_t *, fs, mon_fs)
    {
      struct statvfs vfs;

      if (statvfs (fs->fm_fs, &vfs) != 0)
        continue;
      /* widen first, the statvfs fields can be 32 bit on some platforms */
      fs->fm_total = (uint64) vfs.f_frsize * (uint64) vfs.f_blocks;
      fs->fm_free = (uint64) vfs.f_frsize * (uint64) vfs.f_bavail;
      fs->fm_free_pct = (double) vfs.f_bavail * 100.0 / (double)(vfs.f_blocks - vfs.f_bfree + vfs.f_bavail);
    }
  END_DO_HT;
#endif

  /* get VM size */
  curr_vm_size = get_proc_vm_size ();

  /* thread counts */
  next->mon_thr_run = thr_run;
  next->mon_thr = n_threads;
  next->mon_lw_thr = n_lw_threads;
  /* high cpu is when cpu% > 0.7 * min (runnable threads, max threads) */
  next->mon_high_cpu = ((uint32) next->mon_cpu_pct > (70.0 * MIN (thr_run, mon_max_threads)));
  next->mon_disk_reads = disk_reads;
  next->mon_read_block_usec = read_block_usec;
  next->mon_write_block_usec = write_block_usec;
  next->mon_read_cum_time = read_cum_time;
  if (next->mon_time_elapsed > 0)
    next->mon_read_pct = (100.0 * (double) (next->mon_read_cum_time - prev->mon_read_cum_time)) / (double) next->mon_time_elapsed;
  else
    next->mon_read_pct = 0;

  next->mon_tc_no_thread_kill_idle = tc_no_thread_kill_idle;
  next->mon_tc_no_thread_kill_vdb = tc_no_thread_kill_vdb;
  next->mon_tc_no_thread_kill_running = tc_no_thread_kill_running;
  next->mon_tws_accept_queued = tws_accept_queued;

  next->mon_tc_read_wait = tc_read_wait;
  next->mon_tc_write_wait = tc_write_wait;
  next->mon_tc_cl_keep_alive_timeouts = tc_cl_keep_alive_timeouts;
  next->mon_tc_cl_deadlocks = tc_cl_deadlocks;
  next->mon_lock_deadlocks = lock_deadlocks;
  next->mon_lock_2r1w_deadlocks = lock_2r1w_deadlocks;
  next->mon_lock_waits = lock_waits;
  next->mon_lock_wait_msec = lock_wait_msec;

  /* memory */
  next->mon_mp_mmap_clocks = mp_mmap_clocks;
  next->mon_tc_no_mem_for_longer_batch = tc_no_mem_for_longer_batch;
  next->mon_mp_large_in_use = mp_large_in_use;
  next->mon_tc_part_hash_join = tc_part_hash_join;
  next->mon_tc_slow_temp_insert = tc_slow_temp_insert;
  next->mon_tc_slow_temp_lookup = tc_slow_temp_lookup;

  return 0;
}


/*
 *  mon_update
 *
 *  Takes one sample into the next slot of the statistics ring, with the
 *  slot before it as the previous sample.  The ring position moves on, and
 *  current_inx is set to the new sample, only when mon_get_next succeeds.
 *  Does nothing until mon_init has run or when monitoring is disabled.
 */
void
mon_update (int n_threads, int n_vdb_threads, int n_lw_threads)
{
  unsigned int slot, slot_before;

  if (!mon_enable || !mon_is_inited)
    return;

  slot = monitor_index;
  /* one step back in the ring, slot 0 wraps to the last slot */
  slot_before = (slot + DIMENSION_OF_STATISTICS - 1) % DIMENSION_OF_STATISTICS;

  if (0 != mon_get_next (n_threads, n_vdb_threads, n_lw_threads, &statistics[slot_before], &statistics[slot]))
    return;

  current_inx = slot;
  monitor_index = (slot + 1) % DIMENSION_OF_STATISTICS;
}

#define CLK_SCALE 2000000LL /* how many rdtsc clocks are 1msec */
/* Minimum time between two warnings of the same check, see MON_LOG_WARNING */
#define LOG_INTERVAL_MSEC 120000L
/* Prefix of every warning logged by mon_check */
#define MON_LOG "* Monitor: "
/* Number of most recent samples a MON_CK function looks at */
#define N_SAMPLES_CK 10
/* #define MON_DEBUG 1 */

/*
 *  MON_CK (name, cond, ck) defines  static int mon_<name>_ck ().
 *
 *  The function goes back over the N_SAMPLES_CK most recent samples,
 *  starting at current_inx, with c being the sample looked at and p the one
 *  before it in the ring.  It returns 1 only when cond is true for every
 *  one of them; it returns 0 at the first sample where ck is true or cond
 *  is false.  ck is there for samples that can't be judged, e.g. a zero
 *  that cond would otherwise compare against.
 *
 *  cond is expanded right after a '!', so it must be passed in parentheses.
 */
#define MON_CK(name, cond, ck) \
static int \
mon_##name##_ck () \
{ \
  monitor_t * c, *p; \
  int i, current; \
  current = current_inx; \
  for (i = 0; i < N_SAMPLES_CK; i++) \
    { \
      c = &(statistics[current]); \
      current = (0 == current ? DIMENSION_OF_STATISTICS - 1 : current - 1); \
      p = &(statistics[current]); \
      if (ck) \
	return 0; \
      if (!cond) \
	return 0; \
    } \
  return 1; \
}

/*
 *  Logs the warning unless the same check has logged one during the last
 *  LOG_INTERVAL_MSEC.  Uses the variables i (number of the check) and now
 *  of the enclosing function; the time of the last warning of check i is
 *  kept in mon_log_time[i].  log is passed to log_warning as the format
 *  string, so a literal percent sign has to be written as %%.
 */
#define MON_LOG_WARNING(log) \
    do { \
      long last_ck = mon_log_time[i]; \
      if (!last_ck || ((now - last_ck) > LOG_INTERVAL_MSEC)) { \
      log_warning (log); \
      mon_log_time[i] = now; \
      } \
    } while (0)

/*
 *  Opens one check of mon_check.  i is incremented whether or not cond
 *  holds, so every check has a fixed number that depends only on its
 *  position in the function.
 */
#define CK(cond) if ((++i >= 0) && (cond))

/* disk reads take more than twice the CPU percentage */
MON_CK(read, (c->mon_read_pct > (2.0 * c->mon_cpu_pct)), (c->mon_cpu_pct <= 0))
/* more than 70% of the threads are counted in mon_lw_thr */
MON_CK(locks, (c->mon_lw_thr > (0.7 * c->mon_thr)), (c->mon_thr <= 0))
/* over three times mon_max_threads running, with high CPU */
MON_CK(thr_run, (c->mon_thr_run > (3 * mon_max_threads) && c->mon_high_cpu), 0)
/* the remaining checks need a counter to grow, or a level to hold, in every sample */
MON_CK(tws, (c->mon_tws_accept_queued > p->mon_tws_accept_queued), 0)
MON_CK(thr, (c->mon_thr_run > mon_max_threads && c->mon_cpu_pct < 70.0), 0)
MON_CK(no_thr_idle, (c->mon_tc_no_thread_kill_idle > p->mon_tc_no_thread_kill_idle), 0)
MON_CK(no_thr_vdb, (c->mon_tc_no_thread_kill_vdb > p->mon_tc_no_thread_kill_vdb), 0)
MON_CK(no_thr_running, (c->mon_tc_no_thread_kill_running > p->mon_tc_no_thread_kill_running), 0)
MON_CK(no_part_hj, (c->mon_tc_part_hash_join > p->mon_tc_part_hash_join), 0)
MON_CK(no_qmem, (c->mon_tc_no_mem_for_longer_batch > p->mon_tc_no_mem_for_longer_batch), 0)

/* Change of field m between the previous sample p and the current sample c */
#define DELTA(m) (c->m - p->m)

/*
 *  mon_check
 *
 *  Runs the health checks over the collected samples and logs a warning
 *  for each one that fires.  c is the latest sample, p the one before it.
 *  Does nothing until mon_init has run.
 *
 *  Each CK increments i, and MON_LOG_WARNING uses i as the index into
 *  mon_log_time, so every check is limited on its own to one warning per
 *  LOG_INTERVAL_MSEC.  The order of the checks is therefore significant:
 *  adding or removing one in the middle renumbers the ones after it.
 */
void
mon_check (void)
{
  monitor_t *c, *p;
  int prev_inx, i = 0;
  time_msec_t now = get_msec_real_time ();
  if (!mon_is_inited)
    return;
  prev_inx = 0 == current_inx ? DIMENSION_OF_STATISTICS - 1 : current_inx - 1;
  c = &(statistics[current_inx]);
  p = &(statistics[prev_inx]);
#if defined(MON_DEBUG)
  fprintf (stderr, "thr# %d cpu: %.02f%% read: %.02f%%\n", c->mon_thr_run, c->mon_cpu_pct, c->mon_read_pct);
#endif
  /* disk */
  /* (1) more reads since the last sample than main_bufs, without high CPU */
  CK (main_bufs < DELTA (mon_disk_reads) && !c->mon_high_cpu)
    MON_LOG_WARNING (MON_LOG "High disk read (1)");
  /* (2) read percentage above twice the CPU percentage in all recent samples */
  CK (mon_read_ck ())
    MON_LOG_WARNING (MON_LOG "High disk read (2)");

  /* locks */
  CK (mon_locks_ck ())MON_LOG_WARNING (MON_LOG "Many lock waits");
  /* new deadlocks exceed a tenth of the new lock waits */
  CK ((DELTA (mon_lock_deadlocks) + DELTA (mon_tc_cl_deadlocks)) > (0.1 * DELTA (mon_lock_waits)))
    MON_LOG_WARNING (MON_LOG "Should read for update because lock escalation from shared to exclusive fails frequently (1)");
  /* new 2r1w deadlocks exceed a tenth of all new deadlocks */
  CK (DELTA (mon_lock_2r1w_deadlocks) > (0.1 * DELTA (mon_lock_deadlocks)))
    MON_LOG_WARNING (MON_LOG "Should read for update because lock escalation from shared to exclusive fails frequently (2)");
  /* average wait per new lock wait above 1 msec; the + 1 avoids a division by zero */
  CK (((double) DELTA (mon_lock_wait_msec) / (DELTA (mon_lock_waits) + 1)) > 1)
    MON_LOG_WARNING (MON_LOG "Locks are held for a long time");

  /* threads */
  CK (mon_tws_ck ())MON_LOG_WARNING (MON_LOG
      "No Web Server threads avalable, ServerThreads in [HTTP Server] may have to be increased");
  CK (mon_thr_run_ck ())MON_LOG_WARNING (MON_LOG
      "System is under high load. Adding cluster nodes or using more replicated copies may needed");
  CK (mon_thr_ck ())
    MON_LOG_WARNING (MON_LOG "CPU%% is low while there are large numbers of runnable threads");
  CK (mon_no_thr_idle_ck () || mon_no_thr_vdb_ck () || mon_no_thr_running_ck ())
    MON_LOG_WARNING (MON_LOG "There are too few client threads configured");

  /* memory */
  /*
   * NOTE(review): this fires when the mmap time (clocks / CLK_SCALE, i.e.
   * msec) times 1.1 is above now, which does not look like the "over 10%"
   * of the message - confirm the intended threshold.
   */
  CK (((mp_mmap_clocks / CLK_SCALE) * 1.1) > now)
    MON_LOG_WARNING (MON_LOG "The mp_mmap_clocks over 10%% of real time");
  CK (mon_no_part_hj_ck ())
    MON_LOG_WARNING (MON_LOG "Low hash join space, try to increase HashJoinSpace");
  CK (mon_no_qmem_ck ())
    MON_LOG_WARNING (MON_LOG "Low query memory limit, try to increase MaxQueryMem");
}

/*
 *  mon_log_error_event
 *
 *  Counts the occurrences of the error event (sid, eid) and logs error
 *  once, when the count reaches max.
 *
 *  sid, eid   identify the event; the table key has sid in the top 16 bits
 *             and the low 48 bits of eid below it
 *  error      text to log, written as is (it is not a format string)
 *  max        occurrence at which error is logged
 *  critical   not used at present, see the disabled code below
 *
 *  Returns 1 while the event has occurred at most max times, 0 after that.
 *  Also returns 1, without counting, when the table is not allocated, max
 *  is below 1, or tracking has been switched off.  Tracking is switched
 *  off for good once the table holds more than MAX_ERROR_EVENTS entries.
 */
int
mon_log_error_event (uint16 sid, uint64 eid, char *error, int max, int critical)
{
  error_event_t *ee;
  int ok = 1;
  static int logging_enabled = 1;
  /* must be as wide as the hash key, an int would drop sid and the top of eid */
  ptrlong id = (ptrlong) ((((uint64) sid) << 48) | (0xffffffffffff & eid));

  if (!error_events_ht || max < 1 || !logging_enabled)
    return ok;

  if (error_events_ht->ht_count > MAX_ERROR_EVENTS)
    {
      log_error ("Maximum number of error events reached, stopping error events tracking.");
      logging_enabled = 0;
      return ok;
    }

  rwlock_wrlock (error_events_lock);
  ee = (error_event_t *) gethash ((void *) id, error_events_ht);
  if (NULL == ee)
    {
      /* first occurrence, starts at zero and is counted below like any other */
      ee = dk_alloc (sizeof (error_event_t));
      memset (ee, 0, sizeof (error_event_t));
      ee->ee_sid = sid;
      ee->ee_eid = eid;
      sethash ((void *) id, error_events_ht, (void *) ee);
    }
  ee->ee_count++;
  /* checked for the first occurrence too, so that max == 1 logs as well */
  if (ee->ee_count == max && error)
    log_error ("%s", error);
  if (ee->ee_count > max)
    ok = 0;
#if 0
  if (ee->ee_count > max && critical)
    process_is_swapping = 1;
#endif
  rwlock_unlock (error_events_lock);
  return ok;
}
