Discussion:
[CI] keepalive with Cluster infrastructure health check.
Aneesh Kumar K.V
2002-05-08 09:50:02 UTC
Hi,

Attached below is a patch that adds Cluster Infrastructure (
http://ci-linux.sourceforge.net ) as a health check mechanism for the
keepalived daemon. Please comment on the patch so that I can continue
working on it.

-aneesh


diff -Nru --exclude-from=/tmp/ex.file keepalived-0.5.7/Makefile.in keepalived-0.5.7.new/Makefile.in
--- keepalived-0.5.7/Makefile.in Fri May 3 00:33:26 2002
+++ keepalived-0.5.7.new/Makefile.in Wed May 8 21:22:03 2002
@@ -8,6 +8,7 @@
KERNEL := _KRNL_2_$(shell uname -r | cut -d'.' -f2)_
IPVS_FLAG := @IPVS_SUPPORT@
VRRP_FLAG := @VRRP_SUPPORT@
+CI_LINUX := @CI_LINUX@

prefix = @prefix@
exec_prefix = @exec_prefix@
@@ -62,8 +63,12 @@
vrrp_ipsecah.o
endif

+ifeq ($(CI_LINUX),_WITH_CI_LINUX_)
+CI_LINUX_OBJS = check_ci.o
+endif
+
INCLUDE = -I/usr/src/linux/include
-OBJS = $(CORE_OBJS) $(IPVS_OBJS) $(VRRP_OBJS)
+OBJS = $(CORE_OBJS) $(IPVS_OBJS) $(VRRP_OBJS) $(CI_LINUX_OBJS)

.c.o:
$(CC) -o $@ $(CFLAGS) $(IPVSFLAGS) $(INCLUDE) -c $*.c
diff -Nru --exclude-from=/tmp/ex.file keepalived-0.5.7/check_ci.c keepalived-0.5.7.new/check_ci.c
--- keepalived-0.5.7/check_ci.c Thu Jan 1 05:30:00 1970
+++ keepalived-0.5.7.new/check_ci.c Wed May 8 02:19:03 2002
@@ -0,0 +1,152 @@
+#include "check_ci.h"
+#include "parser.h"
+#include "check_api.h"
+#include "smtp.h"
+#include "ipwrapper.h"
+
+
+static nodenum_ip_map_t *nodemap;
+
+void dump_ci_check(void *data)
+{
+
+ syslog(LOG_INFO, " Keepalive method = CI_LINUX");
+ syslog(LOG_INFO, " Cluster Infrastructure for Linux");
+}
+
+
+int ci_check_thread(thread *thread)
+{
+ checker *checker;
+ int status ;
+ checker = THREAD_ARG(thread);
+
+ status = nodestatus(*(checker->rs));
+
+ switch(status) {
+
+ case UP:
+ if (!ISALIVE(checker->rs)) {
+ smtp_alert(thread->master, checker->rs
+ , NULL
+ , "UP"
+ , "=> CI-Linux CHECK succeed on service <=\n\n");
+ perform_svr_state(UP, checker->vs, checker->rs);
+ }
+ break;
+ case DOWN:
+ if (ISALIVE(checker->rs)) {
+ smtp_alert(thread->master, checker->rs
+ , NULL
+ , "DOWN"
+ , "=> TCP CHECK failed on service <=\n\n");
+ perform_svr_state(DOWN, checker->vs, checker->rs);
+ }
+ break;
+ default:
+ syslog(LOG_ERR,"Unknown node status \n");
+ }
+
+
+ /* Register the next check */
+ thread_add_timer(thread->master, ci_check_thread
+ , checker
+ , checker->vs->delay_loop);
+ return 0;
+
+}
+
+void ci_get_handler(vector strvec)
+{
+ nodemap = (nodenum_ip_map_t *)malloc(sizeof(nodenum_ip_map_t)*cluster_maxnodes()+1); /* zero is not considered as a valid node */
+ if( initialize_nodemap(nodemap) < 0 );
+
+ /* How do i inform the main line code ? */
+
+ queue_checker(NULL,dump_ci_check,ci_check_thread,NULL);
+}
+
+void install_ci_check_keyword(void)
+{
+ install_keyword("CI-LINUX", &ci_get_handler);
+}
+int initialize_nodemap(nodenum_ip_map_t *nodemap)
+{
+ FILE *fp;
+ char buf[BUFFSIZE];
+ int node_number ;
+
+ if( (fp = fopen(CLUSTERTAB,"r")) == NULL )
+ return -1;
+ while(fscanf(fp,"%s",buf) != EOF ){
+ if( buf[0] == '#' ){
+ if(fscanf (fp, "%[^\n]",buf) == EOF )
+ syslog(LOG_ERR," %s File Format Error \n",
+ CLUSTERTAB);
+ bzero(buf,BUFFSIZE);
+ continue;
+ }
+ node_number = atoi(buf);
+ if ( node_number > cluster_maxnodes()) {
+ syslog(LOG_ERR,"Node number greater than MAX node num\n");
+ return -1;
+ }
+ if(fscanf (fp, "%s",buf) == EOF )
+ syslog(LOG_ERR," %s File Format Error \n",CLUSTERTAB);
+ nodemap[node_number].addr_ip = inet_addr(buf);
+
+ if(fscanf (fp, "%[^\n]",buf) == EOF )
+ syslog(LOG_ERR," %s File Format Error \n",CLUSTERTAB);
+ bzero(buf,BUFFSIZE);
+
+ }
+
+
+ return 1;
+}
+
+clusternode_t address_to_nodenum(uint32_t addr_ip)
+{
+ int i ;
+ int max_nodes = cluster_maxnodes();
+ for( i = 1 ; i<= max_nodes;i++) {
+ if( nodemap[i].addr_ip == addr_ip )
+ return i;
+ }
+ return 0; /* Not a valid node */
+}
+int node_to_address(clusternode_t node, real_server *rs)
+{
+
+ if( node > cluster_maxnodes() ) {
+ syslog(LOG_ERR,"Node number greater than Max node num \n");
+ return -1;
+ }
+
+ rs->addr_ip = nodemap[node].addr_ip;
+/* do I need to fill the rest of the values here ? */
+
+ return 1;
+}
+
+int nodestatus(real_server real)
+{
+
+ int node_num;
+ clusternode_info_t ni;
+
+ if((node_num = address_to_nodenum(real.addr_ip) ) == 0 )
+ return UNKNOWN_NODE;
+ if(clusternode_info(node_num, sizeof(ni), &ni) >= 0) {
+ if (ni.node_state == CLUSTERNODE_UP )
+ return UP;
+ else
+/* I am interested only in two states: either fully up or down */
+ return DOWN;
+ }
+ else {
+ syslog(LOG_ERR,"Error in getting the cluster information \n");
+ }
+ return UNKNOWN_NODE;
+
+}
diff -Nru --exclude-from=/tmp/ex.file keepalived-0.5.7/check_ci.h keepalived-0.5.7.new/check_ci.h
--- keepalived-0.5.7/check_ci.h Thu Jan 1 05:30:00 1970
+++ keepalived-0.5.7.new/check_ci.h Wed May 8 21:11:59 2002
@@ -0,0 +1,34 @@
+#ifndef _CI_H
+#define _CI_H
+#include <signal.h>
+#include <pthread.h>
+#include <linux/cluster.h> /* Should change this to cluster.h alone */
+
+#include <syslog.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+
+#include "data.h"
+
+#define SIGCLUSTER 2
+#define CLUSTERTAB "/etc/clustertab"
+#define BUFFSIZE 100
+#define UP 1
+#define DOWN 2
+#define UNKNOWN_NODE 0
+
+
+typedef struct nodenum_ip_map{
+ uint32_t addr_ip;
+} nodenum_ip_map_t;
+
+extern int initialize_nodemap(nodenum_ip_map_t *nodemap);
+extern int node_to_address(clusternode_t node, real_server *rs);
+extern clusternode_t address_to_nodenum(uint32_t addr_ip);
+extern int nodestatus(real_server real);
+
+
+
+#endif /* _CI_H */
+
diff -Nru --exclude-from=/tmp/ex.file keepalived-0.5.7/configure.in keepalived-0.5.7.new/configure.in
--- keepalived-0.5.7/configure.in Fri May 3 01:05:03 2002
+++ keepalived-0.5.7.new/configure.in Wed May 8 23:03:38 2002
@@ -56,6 +56,9 @@
[ --disable-vrrp do not use the VRRP framework])
AC_ARG_ENABLE(debug,
[ --enable-debug compile with debugging flags])
+AC_ARG_WITH(ci_linux,
+ [--with-ci_linux, Compile with Cluster Infrastructure support (default no)],
+ ci_linux=$withval,ci_linux="no")

if test "${enable_lvs}" = "no"; then
if test "${enable_vrrp}" = "no"; then
@@ -77,6 +80,10 @@
DFLAGS="-D_DEBUG_"
AC_SUBST(DFLAGS)
fi
+if test "${ci_linux}" = "yes"; then
+CI_LINUX="_WITH_CI_LINUX_"
+AC_SUBST(CI_LINUX)
+fi

AC_SUBST(IPVS_SUPPORT)
AC_SUBST(VRRP_SUPPORT)
@@ -111,6 +118,9 @@
AC_CHECK_LIB(crypto, MD5_Init,,AC_MSG_ERROR([OpenSSL libraries are required]))
AC_CHECK_LIB(ssl, SSL_CTX_new,,AC_MSG_ERROR([OpenSSL libraries are required]))
AC_CHECK_LIB(popt, poptGetContext,,AC_MSG_ERROR([Popt libraries is required]))
+if test "${ci_linux}" = "yes"; then
+AC_CHECK_LIB(cluster,cluster_maxnodes,,AC_MSG_ERROR([libcluster libraries are required]))
+fi

dnl ----[ Create object list ]----
if test "${KERNEL_CODE}" = "2"; then
@@ -127,6 +137,9 @@
!!! OpenSSL is not properly installed on your system. !!!
!!! Can not include OpenSSL headers files. !!!]))
AC_CHECK_HEADERS(fcntl.h sys/ioctl.h sys/time.h syslog.h unistd.h)
+if test "${ci_linux}" = "yes"; then
+AC_CHECK_HEADERS(linux/cluster.h,,AC_MSG_ERROR([linux/cluster.h file not found]))
+fi

dnl ----[ Checks for typedefs, structures, and compiler characteristics ]----
AC_C_CONST
Greg Freemyer
2002-05-08 17:08:52 UTC
Aneesh,

Note: I intentionally removed the keepalive list from the cc-list

I just looked over your patch and it looks like you did a very good job of keeping the changes isolated from the core keepalived code.

I hope they accept your patch.

Have you or anyone else explained the purpose of the patch, and that the SSI-Linux project is considering leveraging their project as a sub-component?

I know the developer of Mosix was not at all happy that the SSI-Linux project leveraged the load-leveler he developed and open-sourced.

It might be a good idea to get conceptual buy-in from the keepalived daemon developers before incorporating their project into SSI.

Maybe one of the Compaq (I mean HP) guys can talk further about that.

BTW: Will you still be at Digital India?

Greg Freemyer
Internet Engineer
Deployment and Integration Specialist
Compaq ASE - Tru64
Compaq Master ASE - SAN Architect
The Norcross Group
www.NorcrossGroup.com
Post by Aneesh Kumar K.V
Hi,
Attached below is a patch that adds Cluster Infrastructure (
http://ci-linux.sourceforge.net ) as a health check mechanism for the
keepalived daemon. Please comment on the patch so that I can continue
working on it.
-aneesh
--[..snip patch..]--
Alexandre CASSEN
2002-05-13 12:39:07 UTC
Hi Aneesh,
Post by Aneesh Kumar K.V
Attached below is a patch that adds Cluster Infrastructure (
http://ci-linux.sourceforge.net ) as a health check mechanism for the
keepalived daemon. Please comment on the patch so that I can continue
working on it.
--[..snip patch..]--
I have spent a little time reading the CI/SSI docs available on the
project sites and I have some questions:

If I make a mistake please correct me. According to what I have read on
CI/SSI:

* CI provides low-level primitives like ICS & CLMS that are used in the SSI
code. The CI/SSI hooks/patches are installed on all the cluster nodes. A node
is represented by its NODENUM, which is passed to the Linux kernel during
bootstrap. So the CI kernel hooks (ICS & CLMS) make it possible to locate a
specific node by its NODENUM.

* SSI uses the CI primitives (libcluster) to present the collection of
cluster nodes as a single system image. This is the real cluster definition:
a cluster (I agree) is not limited to a single virtualization but spans
multiple system subsystems like FS, network, IPC, process migration, ...

=> This means that all the cluster nodes must have the same kernel/userspace
functionality enabled. This is the point that makes me perplexed about the
CI/SSI LVS integration. If I understand correctly, using LVS with CI/SSI
introduces LVS director virtualization based on CI CLUSTER_NODENUM
identification. So we run an LVS director pool. An LVS director applies
scheduler decisions to load-balance incoming traffic to a specific
realserver, and CI/SSI provides the CVIP to expose the LVS cluster. This is
my first question: using this CVIP means that all nodes consider this IP a
local address, so what about the LVS connection table? If traffic is
directed to a specific LVS director the first time and then jumps to another
the second time, what about connections that require persistence? We will
probably introduce a load-balancing breaking point?

=> The LVS code is routing software, so if we run LVS in a CI/SSI-virtualized
director environment, the LVS connection table must probably be synced
using a hook into the ICS (an ICS channel) to guarantee the integrity
(consistency) of load-balancing decisions. Another point is the CVIP takeover:
is the CVIP a roaming VIP on the cluster? If not, what performs the CVIP
takeover on another cluster node?

=> Reading the patch, it appears that CI node availability is checked
with "clusternode_info" on a cluster NODENUM. So node availability is
returned according to your "keepalive" tool, which informs and updates the
kernel CI cluster node map. This implies that CI/SSI must be installed on all
the CI/SSI cluster nodes, and LVS too. This introduces the fact that a CI/SSI
cluster is some kind of global virtual cluster including (from the
load-balancing point of view) LVS directors and realservers... This is my
other point of confusion. To use the CI healthcheck framework, we need to
patch the kernel with the CI patch, but we also need to patch the kernel with
the LVS code to provide the load-balancing part. So the realservers and the
LVS director are all part of the same exposed cluster, and a server can be an
LVS director and a realserver (a webserver for example) at the same time.
Does this limit the realserver pool to Linux? I don't fully understand that
point ... :/

=> Concerning your patch, you have put everything in the right place. My only
inputs on your contributed code are:

1.
+void ci_get_handler(vector strvec)
+{
+ nodemap = (nodenum_ip_map_t *)malloc(sizeof(nodenum_ip_map_t)
*cluster_maxnodes()+1); /* zero is not considered as a valid node */
+ if( initialize_nodemap(nodemap) < 0 );
+
+ /* How do i inform the main line code ? */
+ queue_checker(NULL,dump_ci_check,ci_check_thread,NULL);
+}

The main line code is informed by queue_checker. During bootstrap,
keepalived parses the configuration file and queues a checker for each
known keyword. After keepalived.conf has been successfully parsed,
keepalived de-queues each checker, running the specific checker registration
function. => This is the code in check_api.c, in the function
register_checkers_thread().
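
To illustrate that flow, here is a minimal, self-contained sketch; the names
(checker_entry, checker_queue, queue_checker_sketch, register_queued_checkers)
are invented for illustration and are not keepalived's actual check_api.c code:

#include <stdlib.h>

typedef int (*checker_fn)(void *);

typedef struct checker_entry {
    void (*dump_fn)(void *);        /* e.g. dump_ci_check   */
    checker_fn thread_fn;           /* e.g. ci_check_thread */
    struct checker_entry *next;
} checker_entry;

static checker_entry *checker_queue;

/* Called from a keyword handler (such as ci_get_handler) while parsing. */
static void queue_checker_sketch(void (*dump_fn)(void *), checker_fn fn)
{
    checker_entry *e = malloc(sizeof(*e));

    if (e == NULL)
        return;
    e->dump_fn = dump_fn;
    e->thread_fn = fn;
    e->next = checker_queue;
    checker_queue = e;
}

/* Called once after keepalived.conf is fully parsed: every queued checker
 * is logged and gets its first check scheduled (the role played by
 * register_checkers_thread()). */
static void register_queued_checkers(void)
{
    checker_entry *e;

    for (e = checker_queue; e != NULL; e = e->next) {
        e->dump_fn(NULL);
        e->thread_fn(NULL);
    }
}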

2.
+int node_to_address(clusternode_t node, real_server *rs)
+{
+
+ if( node > cluster_maxnodes() ) {
+ syslog(LOG_ERR,"Node number greater than Max node num \n");
+ return -1;
+ }
+
+ rs->addr_ip = nodemap[node].addr_ip;
+ /* do I need to fill the rest of the values here ? */
+
+ return 1;
+}
+

The real_server structure is filled in during keepalived.conf parsing. In our
case the configuration file looks like:

virtual_server 10.10.10.2 80 {
delay_loop 30
lb_algo rr
lb_kind NAT
protocol TCP

real_server 192.168.200.6 80 {
weight 1
CI-LINUX
}

}

so the node_to_address() function should be suppressed, since the realserver
structure is filled in parser.c.

Is there another way to get the node num from the IP address? Some kind of
procfs entry, so we can reduce the code further by removing the
initialize_nodemap function? Even better, if we can get this info through
procfs, is it possible to create some kind of kernel reflection function that
reflects nodemap status into an internal keepalived daemon nodestatus
representation?

The code I am addressing is:

+int nodestatus(real_server real)
+{
+
+ int node_num;
+ clusternode_info_t ni;
+
+ if((node_num = address_to_nodenum(real.addr_ip) ) == 0 )
+ return UNKNOWN_NODE;
+ if(clusternode_info(node_num, sizeof(ni), &ni) >= 0) {
+ if (ni.node_state == CLUSTERNODE_UP )
+ return UP;
+ else
+ /* I am interested only in two states: either fully up or down */
+ return DOWN;
+ }
+ else {
+ syslog(LOG_ERR,"Error in getting the cluster information
\n");
+ }
+ return UNKNOWN_NODE;
+
+}

If we can use some kind of kernel reflection this can be done with
something like :

+int nodestatus(real_server rs)
+{
+
+ int status = 0;
+
+ status = CI_NODE_STATE(rs);
+ if (status < 0)
+ return UNKNOWN_NODE;
+ return (status == CLUSTERNODE_UP)?UP:DOWN;
+}

CI_NODE_STATE() would be a simple C macro returning a status flag from a
daemon-side CI structure that is updated by kernel broadcasts through a kernel
socket from the CI kernel hooks? This is the design keepalived already uses
with netlink.
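
A rough sketch of what that reflection could look like on the daemon side;
ci_reflect, ci_reflect_update and a node-number based CI_NODE_STATE() are
hypothetical names here, not existing CI/SSI or keepalived interfaces:

#define CI_MAX_NODES 64                 /* assumption for the sketch */

struct ci_reflect {
    int state[CI_MAX_NODES + 1];        /* node 0 unused, as in the patch */
};

static struct ci_reflect ci_map;

/* Would be called by whatever code receives the kernel broadcast
 * (netlink-style socket), keeping ci_map in sync with the kernel view. */
static void ci_reflect_update(int nodenum, int state)
{
    if (nodenum > 0 && nodenum <= CI_MAX_NODES)
        ci_map.state[nodenum] = state;
}

/* The lookup the CI_NODE_STATE() macro would reduce to, assuming the
 * realserver's node number has already been resolved. */
#define CI_NODE_STATE(nodenum) \
    (((nodenum) > 0 && (nodenum) <= CI_MAX_NODES) ? ci_map.state[nodenum] : -1)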

Best regards, and thanks for your inputs,
Alexandre
Aneesh Kumar K.V
2002-05-14 03:27:03 UTC
Hi,
Post by Alexandre CASSEN
Hi Aneesh,
Post by Aneesh Kumar K.V
Attached below is a patch that adds Cluster Infrastructure (
http://ci-linux.sourceforge.net ) as a health check mechanism for the
keepalived daemon. Please comment on the patch so that I can continue
working on it.
--[..snip patch..]--
I have spent a little time reading the CI/SSI docs available on the
If I make a mistake please correct me. According to what I have read on
* CI provides low-level primitives like ICS & CLMS that are used in the SSI
code. The CI/SSI hooks/patches are installed on all the cluster nodes. A node
is represented by its NODENUM, which is passed to the Linux kernel during
bootstrap. So the CI kernel hooks (ICS & CLMS) make it possible to locate a
specific node by its NODENUM.
* SSI uses the CI primitives (libcluster) to present the collection of
cluster nodes as a single system image. This is the real cluster definition:
a cluster (I agree) is not limited to a single virtualization but spans
multiple system subsystems like FS, network, IPC, process migration, ...
SSI is also a kernel patch. libcluster is the interface for user
applications to access much of the cluster information.
Post by Alexandre CASSEN
=> This means that all the cluster nodes must have the same kernel/userspace
functionality enabled. This is the point that makes me perplexed about the
CI/SSI LVS integration. If I understand correctly, using LVS with CI/SSI
introduces LVS director virtualization based on CI CLUSTER_NODENUM
identification. So we run an LVS director pool. An LVS director applies
scheduler decisions to load-balance incoming traffic to a specific
realserver, and CI/SSI provides the CVIP to expose the LVS cluster. This is
my first question: using this CVIP means that all nodes consider this IP a
local address, so what about the LVS connection table? If traffic is
directed to a specific LVS director the first time and then jumps to another
the second time, what about connections that require persistence? We will
probably introduce a load-balancing breaking point?
You mean what happens when the application migrates to another node? If
I understand your point: I have the server/daemon running at
node1, it migrates to node2 (as per the decision taken by the load
leveler), and the server requires persistent connections; what will
happen?

Well, in the case of already existing connections that are doing reads
and writes (if I understand correctly), these will be carried out as
remote operations on the socket, which means that even though the application
migrates, the socket still remains at the first node (node1). (Brian
should be able to tell more about that. Brian?)

We haven't decided what should happen to the main socket (socket()) that is
bound to the address.
Post by Alexandre CASSEN
=> The LVS code is routing software, so if we run LVS in a CI/SSI-virtualized
director environment, the LVS connection table must probably be synced
using a hook into the ICS (an ICS channel) to guarantee the integrity
(consistency) of load-balancing decisions. Another point is the CVIP takeover:
is the CVIP a roaming VIP on the cluster? If not, what performs the CVIP
takeover on another cluster node?
Right now I am thinking of using a script that will select potential LVS
directors manually, sync the table between them using the --sync option,
and fail over using VRRP.
Post by Alexandre CASSEN
=> Reading the patch, it appears that CI node availability is checked
with "clusternode_info" on a cluster NODENUM. So node availability is
returned according to your "keepalive" tool, which informs and updates the
kernel CI cluster node map. This implies that CI/SSI must be installed on all
the CI/SSI cluster nodes, and LVS too. This introduces the fact that a CI/SSI
cluster is some kind of global virtual cluster including (from the
load-balancing point of view) LVS directors and realservers... This is my
other point of confusion. To use the CI healthcheck framework, we need to
patch the kernel with the CI patch, but we also need to patch the kernel with
the LVS code to provide the load-balancing part. So the realservers and the
LVS director are all part of the same exposed cluster, and a server can be an
LVS director and a realserver (a webserver for example) at the same time.
Does this limit the realserver pool to Linux? I don't fully understand that
point ... :/
I was trying to bring Cluster Wide IP functionality to the SSI cluster
using LVS and VRRP (keepalived). I will also be looking into the initial
work done by Kai. That means nothing is decided; we could use either LVS
and VRRP or the code from HP's NSC for SCO UnixWare.

The patch was intended to let people run LVS with keepalived on
a CI cluster. Whether it forms the basis of a Cluster Wide IP for SSI is yet
to be decided.
Post by Alexandre CASSEN
=> Concerning your patch, you have put everything in the right place. My only
1.
+void ci_get_handler(vector strvec)
+{
+ nodemap = (nodenum_ip_map_t *)malloc(sizeof(nodenum_ip_map_t)
*cluster_maxnodes()+1); /* zero is not considered as a valid node */
+ if( initialize_nodemap(nodemap) < 0 );
+
+ /* How do i inform the main line code ? */
+ queue_checker(NULL,dump_ci_check,ci_check_thread,NULL);
+}
The main line code is informed by queue_checker. During bootstrap,
keepalived parses the configuration file and queues a checker for each
known keyword. After keepalived.conf has been successfully parsed,
keepalived de-queues each checker, running the specific checker registration
function. => This is the code in check_api.c, in the function
register_checkers_thread().
I was actually pointing at the initialize_nodemap function. If it fails,
how am I going to inform the main line code? Will an exit() do it,
or is there another particular way I should exit from the thread? If
my initialize_nodemap fails, then VRRP (I use the term VRRP because
there is another keepalive with SSI; to differentiate, I guess VRRP is OK
for the time being?) cannot run with CI.
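
One possible way to handle that failure without exit(), sketched here only as
an assumption (it reworks the patch's own ci_get_handler and relies on the
same includes as check_ci.c): log the error and simply do not queue the CI
checker, so the rest of keepalived keeps running.

void ci_get_handler(vector strvec)
{
	/* zero is not considered a valid node, hence maxnodes + 1 entries */
	nodemap = malloc(sizeof(nodenum_ip_map_t) * (cluster_maxnodes() + 1));
	if (nodemap == NULL || initialize_nodemap(nodemap) < 0) {
		syslog(LOG_ERR, "CI-LINUX checker disabled: cannot build node map");
		free(nodemap);
		nodemap = NULL;
		return;		/* no checker queued for this keyword */
	}
	queue_checker(NULL, dump_ci_check, ci_check_thread, NULL);
}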
Post by Alexandre CASSEN
2.
+int node_to_address(clusternode_t node, real_server *rs)
+{
+
+ if( node > cluster_maxnodes() ) {
+ syslog(LOG_ERR,"Node number greater than Max node num \n");
+ return -1;
+ }
+
+ rs->addr_ip = nodemap[node].addr_ip;
+ /* do I need to fill the rest of the values here ? */
+
+ return 1;
+}
+
real_server structure is filled during keepalived.conf parsing. In our case
virtual_server 10.10.10.2 80 {
delay_loop 30
lb_algo rr
lb_kind NAT
protocol TCP
real_server 192.168.200.6 80 {
weight 1
CI-LINUX
}
}
so the node_to_address() function should be suppressed, since the realserver
structure is filled in parser.c.
Is there another way to get the node num from the IP address ?
I am not aware of one, but it would be good to have a mapping of node
number to node IP address. For example, in a configuration with node1
having two IP addresses, one for the cluster interconnect and the other for
the external network, it would be good to have an interface like

clusternode_t cluster_node_num(uint32_t addr_ip);

For both IPs it would return node1.
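
A sketch of how such an interface could be built in the daemon if the kernel
does not provide it, assuming the daemon keeps a table of every address a
node answers to; the addr_table below is hypothetical, not a CI/SSI
structure, and clusternode_t is typedef'd only to keep the sketch standalone:

#include <stdint.h>

typedef unsigned int clusternode_t;	/* normally from <linux/cluster.h> */

#define MAX_ADDRS_PER_NODE 4
#define MAX_NODES          64		/* assumption for the sketch */

struct node_addrs {
	uint32_t addr[MAX_ADDRS_PER_NODE];	/* network byte order */
	int      naddrs;
};

static struct node_addrs addr_table[MAX_NODES + 1];	/* node 0 unused */

/* Return the node number owning addr_ip, whether it is the interconnect
 * address or the external one; 0 means "not a cluster address". */
clusternode_t cluster_node_num(uint32_t addr_ip)
{
	clusternode_t node;
	int i;

	for (node = 1; node <= MAX_NODES; node++)
		for (i = 0; i < addr_table[node].naddrs; i++)
			if (addr_table[node].addr[i] == addr_ip)
				return node;
	return 0;
}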


Post by Alexandre CASSEN
Some kind of procfs entry, so we can reduce the code further by removing the
initialize_nodemap function? Even better, if we can get this info through
procfs, is it possible to create some kind of kernel reflection function that
reflects nodemap status into an internal keepalived daemon nodestatus
representation?
+int nodestatus(real_server real)
+{
+
+ int node_num;
+ clusternode_info_t ni;
+
+ if((node_num = address_to_nodenum(real.addr_ip) ) == 0 )
+ return UNKNOWN_NODE;
+ if(clusternode_info(node_num, sizeof(ni), &ni) >= 0) {
+ if (ni.node_state == CLUSTERNODE_UP )
+ return UP;
+ else
+ /* I am interested only in two states: either fully up or down */
+ return DOWN;
+ }
+ else {
+ syslog(LOG_ERR,"Error in getting the cluster information
\n");
+ }
+ return UNKNOWN_NODE;
+
+}
If we can use some kind of kernel reflection this can be done with
+int nodestatus(real_server rs)
+{
+
+ int status = 0;
+
+ status = CI_NODE_STATE(rs);
+ if (status < 0)
+ return UNKNOWN_NODE;
+ return (status == CLUSTERNODE_UP)?UP:DOWN;
+}
CI_NODE_STATE() would be a simple C macro returning a status flag from a
daemon-side CI structure that is updated by kernel broadcasts through a kernel
socket from the CI kernel hooks? This is the design keepalived already uses
with netlink.
CI/SSI also allows an application to learn about node up/down
events by registering for the SIGCLUSTER signal. But using SIGCLUSTER
doesn't fit well into the existing VRRP (keepalived) model with the I/O
multiplexer. In fact, I first tried to do it with the signal, but later
scrapped it because it didn't fit well.
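
For completeness, the usual way a signal such as SIGCLUSTER can be folded into
a select()/poll() based multiplexer is the "self-pipe trick": the handler only
writes a byte into a pipe, and the scheduler watches the read end like any
other fd. A sketch, purely illustrative and not part of the patch:

#include <signal.h>
#include <unistd.h>

#ifndef SIGCLUSTER
#define SIGCLUSTER 2		/* value used by check_ci.h in the patch */
#endif

static int sig_pipe[2];		/* [0] read end for the multiplexer */

static void sigcluster_handler(int signo)
{
	char c = (char)signo;

	/* write() is async-signal-safe; errors are deliberately ignored */
	(void)write(sig_pipe[1], &c, 1);
}

/* Install the handler and return the fd to register with the I/O multiplexer. */
int setup_sigcluster_fd(void)
{
	struct sigaction sa;

	if (pipe(sig_pipe) < 0)
		return -1;

	sa.sa_handler = sigcluster_handler;
	sigemptyset(&sa.sa_mask);
	sa.sa_flags = SA_RESTART;
	if (sigaction(SIGCLUSTER, &sa, NULL) < 0)
		return -1;

	return sig_pipe[0];
}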
Post by Alexandre CASSEN
Best regards, and thanks for your inputs,
Alexandre
-aneesh
Brian J. Watson
2002-05-15 18:16:02 UTC
Post by Aneesh Kumar K.V
Well, in the case of already existing connections that are doing reads
and writes (if I understand correctly), these will be carried out as
remote operations on the socket, which means that even though the application
migrates, the socket still remains at the first node (node1). (Brian
should be able to tell more about that. Brian?)
Yes, processes and their sockets can be on different nodes and still
work. Processes can migrate whereas sockets cannot yet, so it's
important that they be able to work together remotely.

In the currently released SSI code, processes can do file operations
(read, write, poll, ioctl, etc.) on their remote sockets. I'm working on
allowing them to also do remote socket operations (sendmsg, recvmsg,
accept, etc.). This should be available in the next release after 0.6.5.

Hope this clarifies things a bit.
--
Brian Watson | "Now I don't know, but I been told it's
Software Developer | hard to run with the weight of gold,
Open SSI Clustering Project | Other hand I heard it said, it's
Hewlett-Packard Company | just as hard with the weight of lead."
| -Robert Hunter, 1970

mailto:***@hp.com
http://opensource.compaq.com/
Alexandre Cassen
2002-05-14 20:27:03 UTC
Bruce,
Post by Bruce Walker
I think there may be some confusion. Hopefully I can
clarify. There was a technology on Unixware called
NonStop Clusters for Unixware (NSC for short) that was SSI.
The networking piece of NSC had two parts - a CVIP (Cluster
Virtual IP) capability and multi-homed host across the cluster
capability. The CVIP capability was quite similar to LVS. I
won't go into all the detail of what the multi-homed capability
was, although I would be happy to another time.
As we are moving various NSC functionality to Linux, we decided to
leverage the LVS activity rather than porting the CVIP
capability we had. The intent is not to have LVS and
CVIP but just LVS+.
Aah....!!!! OK, crystal clear! This is what I was suspecting. By LVS+ you mean
LVS for CI/SSI (just kidding).
Post by Bruce Walker
The goal of integrating CI/SSI with LVS was to enhance the CI/SSI
environment, not necessarily enhance all LVS environments. As you
point out, the plan is that the director(s) would be inside the CI/SSI
cluster and would load level connections only to nodes also in the
cluster. Integration involves reconciling the nodenum with IP
addresses and leveraging the nodedown capability of CI/SSI when
servers go down and facilitating a failover of the director
when that node goes down.
OK, here is where the VRRP and (keepalived) healthcheck are hooked into the
CI/SSI code with the patch from Aneesh.
Won't delegating the healthcheck to the internal CI/SSI check framework limit
the keepalived CI/SSI integration to the checkers available inside CI/SSI?
This can be a starting point, I agree.
Post by Bruce Walker
There is also interest in extending LVS in this environment in a couple of
ways. First, we may be able to automatically detect when processes bind()
to a registered VIP port, and thus avoid having to declare where the
servers for a port are. More significantly, however, we are considering
allowing the VIP to be used for outgoing connections.
OK, I see. Interesting point...
Post by Bruce Walker
Below I tried to clear up some of the possible confusion.
OK, thanks for your clarifications. I am now in sync with your CI/SSI
conceptual design; I was confused about the CVIP since I had read some info
on it in a PDF on the site.

Best regards and thanks for your time,
Alexandre
Bruce Walker
2002-05-14 19:20:07 UTC
Alexandre,
I think there may be some confusion. Hopefully I can
clarify. There was a technology on Unixware called
NonStop Clusters for Unixware (NSC for short) that was SSI.
The networking piece of NSC had two parts - a CVIP (Cluster
Virtual IP) capability and multi-homed host across the cluster
capability. The CVIP capability was quite similar to LVS. I
won't go into all the detail of what the multi-homed capability
was, although I would be happy to another time.

As we are moving various NSC functionality to Linux, we decided to
leverage the LVS activity rather than porting the CVIP
capability we had. The intent is not to have LVS and
CVIP but just LVS+.

Doing this has two components - integration with the membership/
nodeup, nodedown capability of CI/CLMS; and extending some of the
LVS concepts (which may or may not be applicable to other LVS
configurations).

The goal of integrating CI/SSI with LVS was to enhance the CI/SSI
environment, not necessarily enhance all LVS environments. As you
point out, the plan is that the director(s) would be inside the CI/SSI
cluster and would load level connections only to nodes also in the
cluster. Integration involves reconciling the nodenum with IP
addresses and leveraging the nodedown capability of CI/SSI when
servers go down and facilitating a failover of the director
when that node goes down.

There is also interest in extending LVS in this environment in a couple of
ways. First, we may be able to automatically detect when processes bind()
to a registered VIP port, and thus avoid having to declare where the
servers for a port are. More significantly, however, we are considering
allowing the VIP to be used for outgoing connections. This of course
requires extensions on all nodes in the CI/SSI cluster. If I understand
it properly, such a capability is closer to what exists in TruClusters.

Below I tried to clear up some of the possible confusion.
Post by Alexandre CASSEN
Hi Aneesh,
Post by Aneesh Kumar K.V
Attached below is a patch that adds Cluster Infrastructure (
http://ci-linux.sourceforge.net ) as a health check mechanism for the
keepalived daemon. Please comment on the patch so that I can continue
working on it.
--[..snip patch..]--
I have spent a little time reading the CI/SSI docs available on the
If I make a mistake please correct me. According to what I have read on
* CI provides low-level primitives like ICS & CLMS that are used in the SSI
code. The CI/SSI hooks/patches are installed on all the cluster nodes. A node
is represented by its NODENUM, which is passed to the Linux kernel during
bootstrap. So the CI kernel hooks (ICS & CLMS) make it possible to locate a
specific node by its NODENUM.
Yes, but nodes also have IP addresses and there will be interfaces to
go from nodenum to IP address and back.
Post by Alexandre CASSEN
* SSI uses the CI primitives (libcluster) to present the collection of
cluster nodes as a single system image. This is the real cluster definition:
a cluster (I agree) is not limited to a single virtualization but spans
multiple system subsystems like FS, network, IPC, process migration, ...
This could be a bit misleading. The SSI is not presented by libcluster
but instead by clusterizing all the base system calls via hooks and extensions
in the Linux kernel. libcluster actually provides the interfaces to
"look-under-the-covers" to see the individual components of the cluster.
Post by Alexandre CASSEN
=> This means that all the cluster nodes must have the same kernel/userspace
functionality enabled. This is the point that makes me perplexed about the
CI/SSI LVS integration. If I understand correctly, using LVS with CI/SSI
introduces LVS director virtualization based on CI CLUSTER_NODENUM
identification.
So we run an LVS director pool. An LVS director applies
scheduler decisions to load-balance incoming traffic to a specific
realserver, and CI/SSI provides the CVIP to expose the LVS cluster. This is
my first question: using this CVIP means that all nodes consider this IP a
local address, so what about the LVS connection table? If traffic is
directed to a specific LVS director the first time and then jumps to another
the second time, what about connections that require persistence? We will
probably introduce a load-balancing breaking point?
I'm not sure if the CVIP vs. LVS discussion above helps with this. Note that
my understanding is that incoming connections will work very much the standard
LVS way, including the special cases for persistent connections. Outgoing
connections using the VIP will have to be registered with the director
so responses can be routed to the correct node/server. The director may
also have to be extended to provide a clusterwide port space as well.
Post by Alexandre CASSEN
=> The LVS code is routing software, so if we run LVS in a CI/SSI-virtualized
director environment, the LVS connection table must probably be synced
using a hook into the ICS (an ICS channel) to guarantee the integrity
(consistency) of load-balancing decisions. Another point is the CVIP takeover:
is the CVIP a roaming VIP on the cluster? If not, what performs the CVIP
takeover on another cluster node?
While it may be desirable at some point to integrate LVS with ICS, it is
not necessary to get started. As for facilitating VIP takeover, I think
we should leverage the existing LVS syncd code, although ultimately I would
prefer to rebuild the director on failover (by polling the servers) rather
than paying the overhead of replicating on every connect and disconnect.
That is the strategy we used in NSC for failover.
Post by Alexandre CASSEN
=> Reading the patch, it appears that CI node availability is checked
with "clusternode_info" on a cluster NODENUM. So node availability is
returned according to your "keepalive" tool, which informs and updates the
kernel CI cluster node map. This implies that CI/SSI must be installed on all
the CI/SSI cluster nodes, and LVS too. This introduces the fact that a CI/SSI
cluster is some kind of global virtual cluster including (from the
load-balancing point of view) LVS directors and realservers... This is my
other point of confusion. To use the CI healthcheck framework, we need to
patch the kernel with the CI patch, but we also need to patch the kernel with
the LVS code to provide the load-balancing part. So the realservers and the
LVS director are all part of the same exposed cluster, and a server can be an
LVS director and a realserver (a webserver for example) at the same time.
Does this limit the realserver pool to Linux? I don't fully understand that
point ... :/
Yes. All the code (subject to dynamic loading) will be the same on all
nodes, any node could be director or server or both, and only nodes
that are part of the CI/SSI cluster can participate. That being said,
I don't think it is out of the question to consider other configurations,
like having the director outside the CI/SSI cluster while the various
members of the cluster are each standard LVS servers, or possibly
just having the director in the cluster with the servers external.


bruce

***@hp.com, formerly of compaq.com and tandem.com and originally locus.com