Discussion:
[CI] dlm-CI patch (remove the dlmdu.)
Aneesh Kumar K.V
2002-10-02 06:36:49 UTC
Permalink
Hi,

The patch attached below makes DLM fully kernel based with respect to CI
(http://ci-linux.sf.net). Please comment on it.

Status:
The code builds. I don't have machines lying around to test it with the
cluster. Once I get hold of free machines I will do so. Meanwhile,
people interested can pull the code from the OpenDLM CVS.

Architecture. i386.

NOTE: I am adding the code to opendlm CVS. So heartbeat and rsct people
may also be interested in this patch.

Changes to the generic code:
src/Makefile => the subdir is now passed from the configure.
signal handling in the case of DLM threads.
added new module parameter( haDLM_max_node) which is used only for CI
but is generic.

-aneesh

Index: configure.ac
===================================================================
RCS file: /cvsroot/opendlm/opendlm/configure.ac,v
retrieving revision 1.2
diff -r1.2 configure.ac
56a57,59
AC_ARG_WITH(
ci_linux, [--with-ci_linux use Cluster Infrastructure for Linux],
CLUSTERMGT=ci_linux)
120a124,125
srcsub_dir="make include api kernel user"
AC_SUBST(srcsub_dir)
129a135,136
srcsub_dir="make include api kernel user"
AC_SUBST(srcsub_dir)
135a143,147
ci_linux)
srcsub_dir="make include api kernel "
AC_SUBST(srcsub_dir)
AC_DEFINE(USE_CI_LINUX, , Use CI clustering SW)
;;
140a153
204d216
< src/make/Makefile
Index: src/Makefile.am
===================================================================
RCS file: /cvsroot/opendlm/opendlm/src/Makefile.am,v
retrieving revision 1.1.1.1
diff -r1.1.1.1 Makefile.am
4c4
< SUBDIRS = make include api kernel user
---
Index: src/include/dlm_kernel.h
===================================================================
RCS file: /cvsroot/opendlm/opendlm/src/include/dlm_kernel.h,v
retrieving revision 1.1.1.1
diff -r1.1.1.1 dlm_kernel.h
238a239,240
#ifndef LINUX_VERSION_CODE
#include <linux/version.h>
243a246
#endif
Index: src/kernel/dlmcccp/cccp_udp.c
===================================================================
RCS file: /cvsroot/opendlm/opendlm/src/kernel/dlmcccp/cccp_udp.c,v
retrieving revision 1.1.1.1
diff -r1.1.1.1 cccp_udp.c
28a29,33
*make cccp_poll_thread() smarter about incoming signals.
*/
/*
167a173
170a177,188
/* If we get any signals, other than SIGINT, just flush them and
* restart the socket read.
*/
if ( ((-bytes == EINTR) || (-bytes == ERESTARTSYS))
&& !sigismember(&current->pending.signal, SIGINT)) {
unsigned long _flags;
spin_lock_irqsave(&current->sigmask_lock, _flags);
flush_signals(current);
spin_unlock_irqrestore(&current->sigmask_lock, _flags);
goto recvmsg;
}
Index: src/kernel/dlmdk/clm_main.c
===================================================================
RCS file: /cvsroot/opendlm/opendlm/src/kernel/dlmdk/clm_main.c,v
retrieving revision 1.1.1.1
diff -r1.1.1.1 clm_main.c
532a533,534
#ifndef CONFIG_CLMS /* no user space argument for CLMS */
543a546,547
#endif
Index: src/kernel/dlmdk/dlm_kerndd.c
===================================================================
RCS file: /cvsroot/opendlm/opendlm/src/kernel/dlmdk/dlm_kerndd.c,v
retrieving revision 1.1.1.1
diff -r1.1.1.1 dlm_kerndd.c
72a73,78
* Added support for kernel cluster manager
* define KERN_CLMGR in the user space and
* and CONFIG_CLMS in the kernel space for Cluster Infrastructure for Linux
*/
104a111,125
#ifdef CONFIG_CLMS
#include <cluster/nsc.h>
#include <cluster/icsgen.h>
#include <linux/inet.h> /* in_aton() */
/* To avoid conflicting type with nsc.h and dlm header files*/
#define BOOL_T_DEFINED
#define MAX_NODE_VALUE NSC_MAX_NODE_VALUE
#endif
#ifndef MAX_NODE_VALUE
#define MAX_NODE_VALUE 10
#endif
117a139,143
#ifdef CONFIG_CLMS
#include "dlm_clust.h"
#include "dlm_version.h"
#endif
135a162,166
long haDLM_max_node = 0; /* Maximum number of nodes in the
* cluster
*/
146a178,179
MODULE_PARM(haDLM_max_node, "l" ); /* long */
MODULE_PARM_DESC(haDLM_max_node, "Maximum number of nodes in the cluster. " );
161a195
#ifndef CONFIG_CLMS /* Write is used by user space cluster manager */
165a200
#endif
169a205,220
#ifdef CONFIG_CLMS
int dlm_cli_nodeup( void *clms_handle,
int service,
clusternode_t node,
clusternode_t surrogate,
void *private) ;
int dlm_cli_nodedown( void *clms_handle,
int service,
clusternode_t node,
clusternode_t surrogate,
void *private) ;
MQ_DLM_TOP_INFO_t * get_upnode_list(void);
int start_dlm_for_ci(void);
#endif
181a233
#ifndef CONFIG_CLMS /* haDLM_write is used by user space cluster manager */
182a235
#endif
338a392,395
if ( haDLM_max_node == 0 ) {
haDLM_max_node = MAX_NODE_VALUE;
}
368a426,441
#ifdef CONFIG_CLMS
if ( (_i = start_dlm_for_ci()) < 0 )
return _i ;
haDLM_allow_locks = 1;
return (register_clms_subsys("dlm",
-1,
dlm_cli_nodeup,
dlm_cli_nodedown,
NULL,
NULL,
NULL));
#endif
785c858
<
---
#ifndef CONFIG_CLMS /* Write is used by user space cluster manager */
930a1004
#endif /* CONFIG_CLMS */
1342a1417,1562
#ifdef CONFIG_CLMS
/*
 * CLMS "node up" callback for the DLM subsystem (CONFIG_CLMS only).
 *
 * Builds a fresh topology snapshot via get_upnode_list(), wraps it in a
 * work unit, and queues it for the DLM master thread.  The CLMS nodeup
 * callback is acknowledged whether or not the DLM is ready, so cluster
 * transition is never stalled by the DLM.
 *
 * Returns 0 on success (or when locks are not yet enabled), -ENOMEM on
 * allocation failure.  `surrogate' and `private' are unused but required
 * by the CLMS callback signature.
 */
int dlm_cli_nodeup(void *clms_handle,
		   int service,
		   clusternode_t node,
		   clusternode_t surrogate,
		   void *private)
{
	MQ_DLM_TOP_INFO_t *new_topo;
	dlm_workunit_t *wu;

	if (!haDLM_allow_locks) {
		printk("locks: not yet initialized.\n");
		clms_nodeup_callback(clms_handle, service, node);
		return 0;
	}

	new_topo = get_upnode_list();
	if (new_topo == NULL)
		return -ENOMEM;

	wu = kmalloc_something(sizeof(dlm_workunit_t),
			       "WorkUnit for topology block");
	if (wu == NULL) {
		/* Fix: the original leaked new_topo on this path; release it
		 * with the same destructor used by the work queue. */
		kfree_topo_block(new_topo);
		return -ENOMEM;
	}

	wu->data = new_topo;
	wu->free_data = kfree_topo_block;	/* queue owns new_topo from here */
	wu->type = MQ_DLM_TOP_INFO_MSG;
	dlm_workqueue_put_work(dlm_master_work_queue, wu);
	clms_nodeup_callback(clms_handle, service, node);
	return 0;
}
/*
 * CLMS "node down" callback for the DLM subsystem (CONFIG_CLMS only).
 *
 * Mirror image of dlm_cli_nodeup(): queues a fresh topology snapshot for
 * the DLM master thread and then acknowledges the nodedown event to CLMS.
 *
 * Returns 0 on success (or when locks are not yet enabled), -ENOMEM on
 * allocation failure.  `surrogate' and `private' are unused but required
 * by the CLMS callback signature.
 */
int dlm_cli_nodedown(void *clms_handle,
		     int service,
		     clusternode_t node,
		     clusternode_t surrogate,
		     void *private)
{
	MQ_DLM_TOP_INFO_t *new_topo;
	dlm_workunit_t *wu;

	if (!haDLM_allow_locks) {
		printk("locks: not yet initialized.\n");
		clms_nodedown_callback(clms_handle, service, node);
		return 0;
	}

	new_topo = get_upnode_list();
	if (new_topo == NULL)
		return -ENOMEM;

	/* Fix: allocation tag had a stray leading space; keep it identical
	 * to the tag used by the other topology-block call sites. */
	wu = kmalloc_something(sizeof(dlm_workunit_t),
			       "WorkUnit for topology block");
	if (wu == NULL) {
		/* Fix: the original leaked new_topo on this path. */
		kfree_topo_block(new_topo);
		return -ENOMEM;
	}

	wu->data = new_topo;
	wu->free_data = kfree_topo_block;	/* queue owns new_topo from here */
	wu->type = MQ_DLM_TOP_INFO_MSG;
	dlm_workqueue_put_work(dlm_master_work_queue, wu);
	clms_nodedown_callback(clms_handle, service, node);
	return 0;
}
/* Getting the list of already up nodes */
MQ_DLM_TOP_INFO_t * get_upnode_list(void)
{
MQ_DLM_TOP_INFO_t *new_topo;
int node ;
int size = (haDLM_max_node * sizeof(MQ_DLM_USEADDR_t)) + sizeof(MQ_DLM_TOP_INFO_t);
if (NULL ==
(new_topo = kmalloc_something(size, "CMGR topology block"))) {
return(NULL);
}
new_topo->msg_version = TAM_version;
new_topo->n_nodes = haDLM_max_node;
new_topo->this_nodeid = this_node;
/* new_topo->event ??? */
for(node= 0 ; node < haDLM_max_node;node++ ) {
new_topo->addrs[node].nodeid = node;
/* You can't use clms_isnodeup here because during NODEUP
* event clms_isnodeup on the particular node doesn't
* return 1
*/
if(clms_isnodedown((clusternode_t)node)) {
new_topo->addrs[node].useaddr.s_addr=0;
new_topo->addrs[node].dlm_major= 0;
new_topo->addrs[node].dlm_minor= 0;
}else {
icsinfo_t nodeinfo;
ics_geticsinfo(node, &nodeinfo);
new_topo->addrs[node].useaddr.s_addr= in_aton( (char*)&nodeinfo);
new_topo->addrs[node].dlm_major= DLM_MAJOR_VERSION;
new_topo->addrs[node].dlm_minor= DLM_MINOR_VERSION;
}
}
return new_topo;
}
/*
 * Bring the DLM up under the CI kernel cluster manager (CONFIG_CLMS):
 * start the lockd kernel thread, then queue an initial topology snapshot
 * so the DLM learns about the nodes that are already up.
 *
 * Returns 1 on success (this driver's historical convention), or a
 * negative errno (-ESRCH if the kthread cannot be started, -ENOMEM on
 * allocation failure).
 */
int start_dlm_for_ci(void)	/* fix: match the (void) prototype */
{
	MQ_DLM_TOP_INFO_t *topo;
	dlm_workunit_t *wu;

	printk("[%s] Starting DLM lockd\n", haDLM_name);
	kproc_pid = start_lockd(0, 0);	/* Can use 0 only for CONFIG_CLMS */
	if (ERROR == kproc_pid) {
		bsdlog(LOG_INFO,
		       "[%s] cannot start kernel thread.\n",
		       haDLM_name);
		return -ESRCH;
	}
	bsdlog(LOG_INFO,
	       "[%s] dlmdk kthread started, pid [%d]\n",
	       haDLM_name,
	       kproc_pid);

	topo = get_upnode_list();
	if (topo == NULL)
		return -ENOMEM;

	wu = kmalloc_something(sizeof(dlm_workunit_t),
			       "WorkUnit for topology block");
	if (wu == NULL) {
		/* Fix: the original leaked topo on this path. */
		kfree_topo_block(topo);
		return -ENOMEM;
	}

	wu->data = topo;
	wu->free_data = kfree_topo_block;	/* queue owns topo from here */
	wu->type = MQ_DLM_TOP_INFO_MSG;
	dlm_workqueue_put_work(dlm_master_work_queue, wu);
	return 1;	/* 1 for success */
}
#endif /* CONFIG_CLMS */
Index: src/kernel/dlmdk/dlm_workqueue.c
===================================================================
RCS file: /cvsroot/opendlm/opendlm/src/kernel/dlmdk/dlm_workqueue.c,v
retrieving revision 1.1.1.1
diff -r1.1.1.1 dlm_workqueue.c
29a30,35
* Educating dlm_workqueue_get_work about SIGCLUSTER
* Thanks to Kai.
*/
109a116
111a119,134
#ifdef CONFIG_CLMS
/* This part of the code should be made more
* generic. Find out which signals are to be considered
* for doing a dlm_shutdown and if those signals
*/
/* HACK !!!! I know SIGCLUSTER is not for shutdown*/
if(sigismember(&current->pending.signal,SIGCLUSTER)) {
unsigned long _flags;
spin_lock_irqsave(&current->sigmask_lock,_flags);
flush_signals(current);
spin_unlock_irqrestore(&current->sigmask_lock,_flags);
goto dlm_get_work;
}
#endif
Brian J. Watson
2002-10-02 19:32:24 UTC
Permalink
Post by Aneesh Kumar K.V
Hi,
The patch attached below makes DLM fully kernel based with respect to CI
(http://ci-linux.sf.net ).Please comment about the same.
This is a meta-issue, but it's better to build a patch with the -u
option to diff. That way it'll include some context, which can be used
to patch files that have been slightly modified already.
--
Brian Watson | "Now I don't know, but I been told it's
Software Developer | hard to run with the weight of gold,
Open SSI Clustering Project | Other hand I heard it said, it's
Hewlett-Packard Company | just as hard with the weight of lead."
| -Robert Hunter, 1970

mailto:***@hp.com
http://opensource.compaq.com/
Greg Freemyer
2002-10-03 02:22:04 UTC
Permalink
Did you see David Chow's message on the OpenGFS list that he had a recent discussion with some IBM engineers and they said that the IBM DLM has pretty excessive interconnect requirements.

Just a few nodes could saturate a 100 Mbit interconnect and that to do a large scale network, myrinet would be required.

Obviously it is nice that IBM's DLM (or OpenDLM) can utilize CI for node up/down monitoring, but it makes one wonder if this is the right locking solution to be a core piece of SSI/CI.

Greg Freemyer
Aneesh Kumar K.V
2002-10-03 02:44:03 UTC
Permalink
Hi,

Yes, What David Chow is more worried about is the separate
communication channel needed for DLM. In the case of CI we already have
the IP based communication channel being used for cluster node
communication and I guess the production system will be running SSI
cluster with high speed switches. But from the OpenGFS perspective,
asking for an expensive switch just for DLM may not be a cost-effective
option.

Yes we may need to change DLM to use CI even in node communication. CI
has the concept of throttling that prevents the communication channel
between nodes from getting saturated. ( I guess it is not yet
implemented ). In that way we have a control over DLM messages.


Anyhow, making DLM the locking solution in SSI/CI is something not
yet decided. So no need to worry about that :)

-aneesh
Post by Greg Freemyer
Did you see David Chow's message on the OpenGFS list that he had a recent discussion with some IBM engineers and they said that the IBM DLM has pretty excessive interconnect requirements.
Just a few nodes could saturate a 100 Mbit interconnect and that to do a large scale network, myrinet would be required.
Obviously it is nice that IBM's DLM (or OpenDLM) can utilize CI for node up/down monitoring, but it makes one wonder if this is the right locking solution to be a core piece of SSI/CI.
Greg Freemyer
-------------------------------------------------------
This sf.net email is sponsored by:ThinkGeek
Welcome to geek heaven.
http://thinkgeek.com/sf
_______________________________________________
ci-linux-devel mailing list
https://lists.sourceforge.net/lists/listinfo/ci-linux-devel
Aneesh Kumar K.V
2002-10-07 04:34:08 UTC
Permalink
Hi,

I have checked in the code for CI integration. The dlmdu daemon is now
not used for CI. Tested the code base on a single node machine. I have
tested the code base by running test case convert_test.sh. Next weekend
i will try to get the code base tested against multiple nodes.Meanwhile
other interested guys can take a look at the code. Patches and bug
reports are welcome :) . Information regarding building and configuring
are in INSTALL.opendlm.

I also request other people working on heartbeat and rsct to also look
at the code.

Which mailing list we should use for DLM related development purpose.
IBM mailing list or open-gfs mailing list. Again is it possible to add a
checkin notification mailing list. There is a similar one for CI/SSI


-aneesh
Brian Jackson
2002-10-10 14:58:04 UTC
Permalink
Sorry it took me so long to respond to this email, I set it aside to
remember to respond to it and of course didn't remember.

I set up an opendlm development mailing list with the OpenDLM project at
sourceforge, and since there will be people other than the OpenGFS people
working on it, it makes sense to use it instead of the OpenDLM mailing list.
Post by Aneesh Kumar K.V
Hi,
I have checked in the code for CI integration. The dlmdu daemon is now
not used for CI. Tested the code base on a single node machine. I have
tested the code base by running test case convert_test.sh. Next weekend
i will try to get the code base tested against multiple nodes.Meanwhile
other interested guys can take a look at the code. Patches and bug
reports are welcome :) . Information regarding building and configuring
are in INSTALL.opendlm.
I also request other people working on heartbeat and rsct to also look
at the code.
Which mailing list we should use for DLM related development purpose.
IBM mailing list or open-gfs mailing list. Again is it possible to add a
checkin notification mailing list. There is a similar one for CI/SSI
-aneesh
-------------------------------------------------------
This sf.net email is sponsored by:ThinkGeek
Welcome to geek heaven.
http://thinkgeek.com/sf
_______________________________________________
Opengfs-devel mailing list
https://lists.sourceforge.net/lists/listinfo/opengfs-devel
Loading...