winlin

add st-1.9 to research

要显示太多修改。

为保证性能只显示 14 of 14+ 个文件。

  1 +# The contents of this file are subject to the Mozilla Public
  2 +# License Version 1.1 (the "License"); you may not use this file
  3 +# except in compliance with the License. You may obtain a copy of
  4 +# the License at http://www.mozilla.org/MPL/
  5 +#
  6 +# Software distributed under the License is distributed on an "AS
  7 +# IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
  8 +# implied. See the License for the specific language governing
  9 +# rights and limitations under the License.
  10 +#
  11 +# The Original Code is the Netscape Portable Runtime library.
  12 +#
  13 +# The Initial Developer of the Original Code is Netscape
  14 +# Communications Corporation. Portions created by Netscape are
  15 +# Copyright (C) 1994-2000 Netscape Communications Corporation. All
  16 +# Rights Reserved.
  17 +#
  18 +# Contributor(s): Silicon Graphics, Inc.
  19 +#
  20 +# Portions created by SGI are Copyright (C) 2000-2001 Silicon
  21 +# Graphics, Inc. All Rights Reserved.
  22 +#
  23 +# Alternatively, the contents of this file may be used under the
  24 +# terms of the GNU General Public License Version 2 or later (the
  25 +# "GPL"), in which case the provisions of the GPL are applicable
  26 +# instead of those above. If you wish to allow use of your
  27 +# version of this file only under the terms of the GPL and not to
  28 +# allow others to use your version of this file under the MPL,
  29 +# indicate your decision by deleting the provisions above and
  30 +# replace them with the notice and other provisions required by
  31 +# the GPL. If you do not delete the provisions above, a recipient
  32 +# may use your version of this file under either the MPL or the
  33 +# GPL.
  34 +
  35 +# This is the full version of the libst library - modify carefully
  36 +VERSION = 1.9
  37 +
  38 +##########################
  39 +# Supported OSes:
  40 +#
  41 +#OS = AIX
  42 +#OS = CYGWIN
  43 +#OS = DARWIN
  44 +#OS = FREEBSD
  45 +#OS = HPUX
  46 +#OS = HPUX_64
  47 +#OS = IRIX
  48 +#OS = IRIX_64
  49 +#OS = LINUX
  50 +#OS = NETBSD
  51 +#OS = OPENBSD
  52 +#OS = OSF1
  53 +#OS = SOLARIS
  54 +#OS = SOLARIS_64
  55 +
  56 +# Please see the "Other possible defines" section below for
  57 +# possible compilation options.
  58 +##########################
  59 +
  60 +CC = cc
  61 +AR = ar
  62 +LD = ld
  63 +RANLIB = ranlib
  64 +LN = ln
  65 +
  66 +SHELL = /bin/sh
  67 +ECHO = /bin/echo
  68 +
  69 +BUILD = DBG
  70 +TARGETDIR = $(OS)_$(shell uname -r)_$(BUILD)
  71 +
  72 +DEFINES = -D$(OS)
  73 +CFLAGS =
  74 +SFLAGS =
  75 +ARFLAGS = -rv
  76 +LNFLAGS = -s
  77 +DSO_SUFFIX = so
  78 +
  79 +MAJOR = $(shell echo $(VERSION) | sed 's/^\([^\.]*\).*/\1/')
  80 +DESC = st.pc
  81 +
  82 +##########################
  83 +# Platform section.
  84 +# Possible targets:
  85 +
  86 +TARGETS = aix-debug aix-optimized \
  87 + cygwin-debug cygwin-optimized \
  88 + darwin-debug darwin-optimized \
  89 + freebsd-debug freebsd-optimized \
  90 + hpux-debug hpux-optimized \
  91 + hpux-64-debug hpux-64-optimized \
  92 + irix-n32-debug irix-n32-optimized \
  93 + irix-64-debug irix-64-optimized \
  94 + linux-debug linux-optimized \
  95 + netbsd-debug netbsd-optimized \
  96 + openbsd-debug openbsd-optimized \
  97 + osf1-debug osf1-optimized \
  98 + solaris-debug solaris-optimized \
  99 + solaris-64-debug solaris-64-optimized
  100 +
  101 +#
  102 +# Platform specifics
  103 +#
  104 +
  105 +ifeq ($(OS), AIX)
  106 +AIX_VERSION = $(shell uname -v).$(shell uname -r)
  107 +TARGETDIR = $(OS)_$(AIX_VERSION)_$(BUILD)
  108 +CC = xlC
  109 +STATIC_ONLY = yes
  110 +ifeq ($(BUILD), OPT)
  111 +OTHER_FLAGS = -w
  112 +endif
  113 +ifneq ($(filter-out 4.1 4.2, $(AIX_VERSION)),)
  114 +DEFINES += -DMD_HAVE_SOCKLEN_T
  115 +endif
  116 +endif
  117 +
  118 +ifeq ($(OS), CYGWIN)
  119 +TARGETDIR = $(OS)_$(BUILD)
  120 +CC = gcc
  121 +LD = gcc
  122 +DSO_SUFFIX = dll
  123 +SLIBRARY = $(TARGETDIR)/libst.dll.a
  124 +DLIBRARY = $(TARGETDIR)/libst.dll
  125 +DEF_FILE = $(TARGETDIR)/libst.def
  126 +LDFLAGS = libst.def -shared --enable-auto-image-base -Wl,--output-def,$(DEF_FILE),--out-implib,$(SLIBRARY)
  127 +OTHER_FLAGS = -Wall
  128 +endif
  129 +
  130 +ifeq ($(OS), DARWIN)
  131 +LD = cc
  132 +SFLAGS = -fPIC -fno-common
  133 +DSO_SUFFIX = dylib
  134 +RELEASE = $(shell uname -r | cut -d. -f1)
  135 +PPC = $(shell test $(RELEASE) -le 9 && echo yes)
  136 +INTEL = $(shell test $(RELEASE) -ge 9 && echo yes)
  137 +ifeq ($(PPC), yes)
  138 +CFLAGS += -arch ppc
  139 +LDFLAGS += -arch ppc
  140 +endif
  141 +ifeq ($(INTEL), yes)
  142 +CFLAGS += -arch i386 -arch x86_64
  143 +LDFLAGS += -arch i386 -arch x86_64
  144 +endif
  145 +LDFLAGS += -dynamiclib -install_name /sw/lib/libst.$(MAJOR).$(DSO_SUFFIX) -compatibility_version $(MAJOR) -current_version $(VERSION)
  146 +OTHER_FLAGS = -Wall
  147 +endif
  148 +
  149 +ifeq ($(OS), FREEBSD)
  150 +SFLAGS = -fPIC
  151 +LDFLAGS = -shared -soname=$(SONAME) -lc
  152 +OTHER_FLAGS = -Wall
  153 +ifeq ($(shell test -f /usr/include/sys/event.h && echo yes), yes)
  154 +DEFINES += -DMD_HAVE_KQUEUE
  155 +endif
  156 +endif
  157 +
  158 +ifeq (HPUX, $(findstring HPUX, $(OS)))
  159 +ifeq ($(OS), HPUX_64)
  160 +DEFINES = -DHPUX
  161 +CFLAGS = -Ae +DD64 +Z
  162 +else
  163 +CFLAGS = -Ae +DAportable +Z
  164 +endif
  165 +RANLIB = true
  166 +LDFLAGS = -b
  167 +DSO_SUFFIX = sl
  168 +endif
  169 +
  170 +ifeq (IRIX, $(findstring IRIX, $(OS)))
  171 +ifeq ($(OS), IRIX_64)
  172 +DEFINES = -DIRIX
  173 +ABIFLAG = -64
  174 +else
  175 +ABIFLAG = -n32
  176 +endif
  177 +RANLIB = true
  178 +CFLAGS = $(ABIFLAG) -mips3
  179 +LDFLAGS = $(ABIFLAG) -shared
  180 +OTHER_FLAGS = -fullwarn
  181 +endif
  182 +
  183 +ifeq ($(OS), LINUX)
  184 +EXTRA_OBJS = $(TARGETDIR)/md.o
  185 +SFLAGS = -fPIC
  186 +LDFLAGS = -shared -soname=$(SONAME) -lc
  187 +OTHER_FLAGS = -Wall
  188 +ifeq ($(shell test -f /usr/include/sys/epoll.h && echo yes), yes)
  189 +DEFINES += -DMD_HAVE_EPOLL
  190 +endif
  191 +endif
  192 +
  193 +ifeq ($(OS), NETBSD)
  194 +SFLAGS = -fPIC
  195 +LDFLAGS = -shared -soname=$(SONAME) -lc
  196 +OTHER_FLAGS = -Wall
  197 +endif
  198 +
  199 +ifeq ($(OS), OPENBSD)
  200 +SFLAGS = -fPIC
  201 +LDFLAGS = -shared -soname=$(SONAME) -lc
  202 +OTHER_FLAGS = -Wall
  203 +ifeq ($(shell test -f /usr/include/sys/event.h && echo yes), yes)
  204 +DEFINES += -DMD_HAVE_KQUEUE
  205 +endif
  206 +endif
  207 +
  208 +ifeq ($(OS), OSF1)
  209 +RANLIB = true
  210 +LDFLAGS = -shared -all -expect_unresolved "*"
  211 +endif
  212 +
  213 +ifeq (SOLARIS, $(findstring SOLARIS, $(OS)))
  214 +TARGETDIR = $(OS)_$(shell uname -r | sed 's/^5/2/')_$(BUILD)
  215 +CC = gcc
  216 +LD = gcc
  217 +RANLIB = true
  218 +LDFLAGS = -G
  219 +OTHER_FLAGS = -Wall
  220 +ifeq ($(OS), SOLARIS_64)
  221 +DEFINES = -DSOLARIS
  222 +CFLAGS += -m64
  223 +LDFLAGS += -m64
  224 +endif
  225 +endif
  226 +
  227 +#
  228 +# End of platform section.
  229 +##########################
  230 +
  231 +
  232 +ifeq ($(BUILD), OPT)
  233 +OTHER_FLAGS += -O
  234 +else
  235 +OTHER_FLAGS += -g
  236 +DEFINES += -DDEBUG
  237 +endif
  238 +
  239 +##########################
  240 +# Other possible defines:
  241 +# To use poll(2) instead of select(2) for events checking:
  242 +# DEFINES += -DUSE_POLL
  243 +# You may prefer to use select for applications that have many threads
  244 +# using one file descriptor, and poll for applications that have many
  245 +# different file descriptors. With USE_POLL poll() is called with at
  246 +# least one pollfd per I/O-blocked thread, so 1000 threads sharing one
  247 +# descriptor will poll 1000 identical pollfds and select would be more
  248 +# efficient. But if the threads all use different descriptors poll()
  249 +# may be better depending on your operating system's implementation of
  250 +# poll and select. Really, it's up to you. Oh, and on some platforms
  251 +# poll() fails with more than a few dozen descriptors.
  252 +#
  253 +# Some platforms allow FD_SETSIZE to be redefined (when select() is used), e.g.:
  254 +# DEFINES += -DFD_SETSIZE=4096
  255 +#
  256 +# To use malloc(3) instead of mmap(2) for stack allocation:
  257 +# DEFINES += -DMALLOC_STACK
  258 +#
  259 +# To provision more than the default 16 thread-specific-data keys
  260 +# (but not too many!):
  261 +# DEFINES += -DST_KEYS_MAX=<n>
  262 +#
  263 +# To start with more than the default 64 initial pollfd slots
  264 +# (but the table grows dynamically anyway):
  265 +# DEFINES += -DST_MIN_POLLFDS_SIZE=<n>
  266 +#
  267 +# Note that you can also add these defines by specifying them as
  268 +# make/gmake arguments (without editing this Makefile). For example:
  269 +#
  270 +# make EXTRA_CFLAGS=-DUSE_POLL <target>
  271 +#
  272 +# (replace make with gmake if needed).
  273 +#
  274 +# You can also modify the default selection of an alternative event
  275 +# notification mechanism. E.g., to enable kqueue(2) support (if it's not
  276 +# enabled by default):
  277 +#
  278 +# gmake EXTRA_CFLAGS=-DMD_HAVE_KQUEUE <target>
  279 +#
  280 +# or to disable default epoll(4) support:
  281 +#
  282 +# make EXTRA_CFLAGS=-UMD_HAVE_EPOLL <target>
  283 +#
  284 +##########################
  285 +
  286 +CFLAGS += $(DEFINES) $(OTHER_FLAGS) $(EXTRA_CFLAGS)
  287 +
  288 +OBJS = $(TARGETDIR)/sched.o \
  289 + $(TARGETDIR)/stk.o \
  290 + $(TARGETDIR)/sync.o \
  291 + $(TARGETDIR)/key.o \
  292 + $(TARGETDIR)/io.o \
  293 + $(TARGETDIR)/event.o
  294 +OBJS += $(EXTRA_OBJS)
  295 +HEADER = $(TARGETDIR)/st.h
  296 +SLIBRARY = $(TARGETDIR)/libst.a
  297 +DLIBRARY = $(TARGETDIR)/libst.$(DSO_SUFFIX).$(VERSION)
  298 +EXAMPLES = examples
  299 +
  300 +LINKNAME = libst.$(DSO_SUFFIX)
  301 +SONAME = libst.$(DSO_SUFFIX).$(MAJOR)
  302 +FULLNAME = libst.$(DSO_SUFFIX).$(VERSION)
  303 +
  304 +ifeq ($(OS), CYGWIN)
  305 +SONAME = cygst.$(DSO_SUFFIX)
  306 +SLIBRARY = $(TARGETDIR)/libst.dll.a
  307 +DLIBRARY = $(TARGETDIR)/$(SONAME)
  308 +LINKNAME =
  309 +# examples directory does not compile under cygwin
  310 +EXAMPLES =
  311 +endif
  312 +
  313 +ifeq ($(OS), DARWIN)
  314 +LINKNAME = libst.$(DSO_SUFFIX)
  315 +SONAME = libst.$(MAJOR).$(DSO_SUFFIX)
  316 +FULLNAME = libst.$(VERSION).$(DSO_SUFFIX)
  317 +endif
  318 +
  319 +ifeq ($(STATIC_ONLY), yes)
  320 +LIBRARIES = $(SLIBRARY)
  321 +else
  322 +LIBRARIES = $(SLIBRARY) $(DLIBRARY)
  323 +endif
  324 +
  325 +ifeq ($(OS),)
  326 +ST_ALL = unknown
  327 +else
  328 +ST_ALL = $(TARGETDIR) $(LIBRARIES) $(HEADER) $(EXAMPLES) $(DESC)
  329 +endif
  330 +
  331 +all: $(ST_ALL)
  332 +
  333 +unknown:
  334 + @echo
  335 + @echo "Please specify one of the following targets:"
  336 + @echo
  337 + @for target in $(TARGETS); do echo $$target; done
  338 + @echo
  339 +
  340 +st.pc: st.pc.in
  341 + sed "s/@VERSION@/${VERSION}/g" < $< > $@
  342 +
  343 +$(TARGETDIR):
  344 + if [ ! -d $(TARGETDIR) ]; then mkdir $(TARGETDIR); fi
  345 +
  346 +$(SLIBRARY): $(OBJS)
  347 + $(AR) $(ARFLAGS) $@ $(OBJS)
  348 + $(RANLIB) $@
  349 + rm -f obj; $(LN) $(LNFLAGS) $(TARGETDIR) obj
  350 +
  351 +$(DLIBRARY): $(OBJS:%.o=%-pic.o)
  352 + $(LD) $(LDFLAGS) $^ -o $@
  353 + if test "$(LINKNAME)"; then \
  354 + cd $(TARGETDIR); \
  355 + rm -f $(SONAME) $(LINKNAME); \
  356 + $(LN) $(LNFLAGS) $(FULLNAME) $(SONAME); \
  357 + $(LN) $(LNFLAGS) $(FULLNAME) $(LINKNAME); \
  358 + fi
  359 +
  360 +$(HEADER): public.h
  361 + rm -f $@
  362 + cp public.h $@
  363 +
  364 +$(TARGETDIR)/md.o: md.S
  365 + $(CC) $(CFLAGS) -c $< -o $@
  366 +
  367 +$(TARGETDIR)/%.o: %.c common.h md.h
  368 + $(CC) $(CFLAGS) -c $< -o $@
  369 +
  370 +examples::
  371 + @echo Making $@
  372 + @cd $@; $(MAKE) CC="$(CC)" CFLAGS="$(CFLAGS)" OS="$(OS)" TARGETDIR="$(TARGETDIR)"
  373 +
  374 +clean:
  375 + rm -rf *_OPT *_DBG obj st.pc
  376 +
  377 +##########################
  378 +# Pattern rules:
  379 +
  380 +ifneq ($(SFLAGS),)
  381 +# Compile with shared library options if it's a C file
  382 +$(TARGETDIR)/%-pic.o: %.c common.h md.h
  383 + $(CC) $(CFLAGS) $(SFLAGS) -c $< -o $@
  384 +endif
  385 +
  386 +# Otherwise (assembly, or C when SFLAGS is empty) the -pic object is just a link to the normal object
  387 +%-pic.o: %.o
  388 + rm -f $@; $(LN) $(LNFLAGS) $(<F) $@
  389 +
  390 +##########################
  391 +# Target rules:
  392 +
  393 +default-debug:
  394 + . ./osguess.sh; $(MAKE) OS="$$OS" BUILD="DBG"
  395 +default default-optimized:
  396 + . ./osguess.sh; $(MAKE) OS="$$OS" BUILD="OPT"
  397 +
  398 +aix-debug:
  399 + $(MAKE) OS="AIX" BUILD="DBG"
  400 +aix-optimized:
  401 + $(MAKE) OS="AIX" BUILD="OPT"
  402 +
  403 +cygwin-debug:
  404 + $(MAKE) OS="CYGWIN" BUILD="DBG"
  405 +cygwin-optimized:
  406 + $(MAKE) OS="CYGWIN" BUILD="OPT"
  407 +
  408 +darwin-debug:
  409 + $(MAKE) OS="DARWIN" BUILD="DBG"
  410 +darwin-optimized:
  411 + $(MAKE) OS="DARWIN" BUILD="OPT"
  412 +
  413 +freebsd-debug:
  414 + $(MAKE) OS="FREEBSD" BUILD="DBG"
  415 +freebsd-optimized:
  416 + $(MAKE) OS="FREEBSD" BUILD="OPT"
  417 +
  418 +hpux-debug:
  419 + $(MAKE) OS="HPUX" BUILD="DBG"
  420 +hpux-optimized:
  421 + $(MAKE) OS="HPUX" BUILD="OPT"
  422 +hpux-64-debug:
  423 + $(MAKE) OS="HPUX_64" BUILD="DBG"
  424 +hpux-64-optimized:
  425 + $(MAKE) OS="HPUX_64" BUILD="OPT"
  426 +
  427 +irix-n32-debug:
  428 + $(MAKE) OS="IRIX" BUILD="DBG"
  429 +irix-n32-optimized:
  430 + $(MAKE) OS="IRIX" BUILD="OPT"
  431 +irix-64-debug:
  432 + $(MAKE) OS="IRIX_64" BUILD="DBG"
  433 +irix-64-optimized:
  434 + $(MAKE) OS="IRIX_64" BUILD="OPT"
  435 +
  436 +linux-debug:
  437 + $(MAKE) OS="LINUX" BUILD="DBG"
  438 +linux-optimized:
  439 + $(MAKE) OS="LINUX" BUILD="OPT"
  440 +# Compatibility aliases: the old linux-ia64-* target names map to the generic Linux targets
  441 +linux-ia64-debug: linux-debug
  442 +linux-ia64-optimized: linux-optimized
  443 +
  444 +netbsd-debug:
  445 + $(MAKE) OS="NETBSD" BUILD="DBG"
  446 +netbsd-optimized:
  447 + $(MAKE) OS="NETBSD" BUILD="OPT"
  448 +
  449 +openbsd-debug:
  450 + $(MAKE) OS="OPENBSD" BUILD="DBG"
  451 +openbsd-optimized:
  452 + $(MAKE) OS="OPENBSD" BUILD="OPT"
  453 +
  454 +osf1-debug:
  455 + $(MAKE) OS="OSF1" BUILD="DBG"
  456 +osf1-optimized:
  457 + $(MAKE) OS="OSF1" BUILD="OPT"
  458 +
  459 +solaris-debug:
  460 + $(MAKE) OS="SOLARIS" BUILD="DBG"
  461 +solaris-optimized:
  462 + $(MAKE) OS="SOLARIS" BUILD="OPT"
  463 +solaris-64-debug:
  464 + $(MAKE) OS="SOLARIS_64" BUILD="DBG"
  465 +solaris-64-optimized:
  466 + $(MAKE) OS="SOLARIS_64" BUILD="OPT"
  467 +
  468 +##########################
  469 +
  1 +WELCOME!
  2 +
  3 +The State Threads Library is a small application library which provides
  4 +a foundation for writing fast and highly scalable Internet applications
  5 +(such as web servers, proxy servers, mail transfer agents, and so on,
  6 +really any network-data-driven application) on UNIX-like platforms. It
  7 +combines the simplicity of the multithreaded programming paradigm, in
  8 +which one thread supports each simultaneous connection, with the
  9 +performance and scalability of an event-driven state machine
  10 +architecture. In other words, this library offers a threading API for
  11 +structuring an Internet application as a state machine. For more
  12 +details, please see the library documentation in the "docs" directory or
  13 +on-line at
  14 +
  15 + http://state-threads.sourceforge.net/docs/
  16 +
  17 +The State Threads Project is an open source project for maintaining and
  18 +enhancing the State Threads Library. For more information about this
  19 +project, please see
  20 +
  21 + http://state-threads.sourceforge.net/
  22 +
  23 +
  24 +BUILDING
  25 +
  26 +To build the library by hand, use the GNU make utility. Run the make
  27 +command (e.g., `gmake') with no arguments to display all supported
  28 +targets.
  29 +
  30 +To build more or less automatically, first set the CONFIG_GUESS_PATH
  31 +variable in either osguess.sh or your environment then run "make
  32 +default" which guesses your OS and builds. Requires the "config.guess"
  33 +utility from GNU autoconf (not included with ST). You can use one from
  34 +a larger "main" software project or just use any config.guess available
  35 +on your system. You can also get it directly from GNU:
  36 +ftp://ftp.gnu.org/gnu/autoconf/
  37 +
  38 +To build rpms (RedHat Linux 6.2 or later, Linux/Mandrake, Solaris with
  39 +gnome, etc.):
  40 + download the latest st-x.y.tar.gz
  41 + # rpm -ta st-x.y.tar.gz
  42 +The .rpms will land in /usr/src/RPMS/<arch>. Install them with:
  43 + # rpm -i libst*.rpm
  44 +Requires GNU automake and rpm 3.0.3 or later.
  45 +
  46 +Debian users:
  47 + If you run potato, please upgrade to woody.
  48 + If you run woody, "apt-get install libst-dev" will get you v1.3.
  49 + If you run testing/unstable, you will get the newest available version.
  50 + If you *must* have the newest libst in woody, you may follow these
  51 + not-recommended instructions:
  52 + 1. Add "deb-src <your-favourite-debian-mirror> unstable main" to your
  53 + /etc/apt/sources.list
  54 + 2. apt-get update
  55 + 3. apt-get source st
  56 + 4. cd st-1.4 (or whatever version you got)
  57 + 5. debuild
  58 + 6. dpkg -i ../*.deb
  59 +
  60 +If your application uses autoconf to search for dependencies and you
  61 +want to search for a given version of libst, you can simply add
  62 + PKG_CHECK_MODULES(MYAPP, st >= 1.3 mumble >= 0.2.23)
  63 +to your configure.ac/in. This will define @MYAPP_LIBS@ and
  64 +@MYAPP_CFLAGS@ which you may then use in your Makefile.am/in files to
  65 +link against mumble and st.
  66 +
  67 +
  68 +LICENSE
  69 +
  70 +The State Threads library is a derivative of the Netscape Portable
  71 +Runtime library (NSPR). All source code in this directory is
  72 +distributed under the terms of the Mozilla Public License (MPL) version
  73 +1.1 or the GNU General Public License (GPL) version 2 or later. For
  74 +more information about these licenses please see
  75 +http://www.mozilla.org/MPL/ and http://www.gnu.org/copyleft/.
  76 +
  77 +All source code in the "examples" directory is distributed under a
  78 +BSD-style license.
  79 +
  80 +
  81 +PLATFORMS
  82 +
  83 +Please see the "docs/notes.html" file for the list of currently
  84 +supported platforms.
  85 +
  86 +
  87 +DEBUGGER SUPPORT
  88 +
  89 +It's almost impossible to print SP and PC in a portable way. The only
  90 +way to see a thread's stack platform-independently is to actually jump to
  91 +the saved context. That's what the _st_iterate_threads() function does.
  92 +Do the following to iterate over all threads:
  93 +
  94 +- set the _st_iterate_threads_flag to 1 in debugger
  95 +- set breakpoint at the _st_show_thread_stack() function
  96 + (which does nothing)
  97 +- call the _st_iterate_threads() function which jumps to the
  98 + next thread
  99 +- at each break you can explore the thread's stack
  100 +- continue
  101 +- when iteration is complete, you return to the original
  102 + point (you can see thread id and a message as arguments of
  103 + the _st_show_thread_stack() function).
  104 +
  105 +You can call _st_iterate_threads() in three ways:
  106 +
  107 +- Insert it into your source code at the point you want to
  108 + go over threads.
  109 +- Just run the application and this function will be called at
  110 + the first context switch.
  111 +- Call it directly from the debugger at any point.
  112 +
  113 +This works with gdb and dbx.
  114 +
  115 +Example using gdb:
  116 +
  117 +(gdb) set _st_iterate_threads_flag = 1
  118 +(gdb) b _st_show_thread_stack
  119 +...
  120 +(gdb) call _st_iterate_threads()
  121 +...
  122 +(gdb) bt
  123 +...
  124 +(gdb) c
  125 +...
  126 +(gdb) bt
  127 +...
  128 +(gdb) c
  129 +...
  130 +and so on...
  131 +
  132 +_st_iterate_threads_flag will be set to 0 automatically
  133 +after iteration is over or you can set it to 0 at any time
  134 +to stop iteration.
  135 +
  136 +Sometimes gdb complains about SIGSEGV when you call a function
  137 +directly at the gdb command line. It can be ignored -- just call the
  138 +same function again right away and it works just fine. For example:
  139 +
  140 +(gdb) set _st_iterate_threads_flag = 1
  141 +(gdb) b _st_show_thread_stack
  142 +Breakpoint 1 at 0x809bbbb: file sched.c, line 856.
  143 +(gdb) call _st_iterate_threads()
  144 +Program received signal SIGSEGV, Segmentation fault.
  145 +....
  146 +(gdb) # just call the function again:
  147 +(gdb) call _st_iterate_threads()
  148 +Breakpoint 1, _st_show_thread_stack (thread=0x4017aee4, messg=0x80ae7a2
  149 +"Iteration started") at sched.c:856
  150 +856 }
  151 +....
  152 +
  153 +You can use simple gdb command-line scripting to display
  154 +all threads and their stack traces at once:
  155 +
  156 +(gdb) while _st_iterate_threads_flag
  157 + >bt
  158 + >c
  159 + >end
  160 +....
  161 +
  162 +Another script to stop at the thread with the specific thread id
  163 +(e.g., 0x40252ee4):
  164 +
  165 +(gdb) # set the flag again:
  166 +(gdb) set _st_iterate_threads_flag = 1
  167 +(gdb) call _st_iterate_threads()
  168 +Breakpoint 1, _st_show_thread_stack (thread=0x4017aee4, messg=0x80ae7a2
  169 +"Iteration started") at sched.c:856
  170 +856 }
  171 +....
  172 +(gdb) while thread != 0x40252ee4
  173 + >c
  174 + >end
  175 +....
  176 +....
  177 +Breakpoint 1, _st_show_thread_stack (thread=0x40252ee4, messg=0x0) at
  178 +sched.c:856
  179 +856 }
  180 +(gdb) bt
  181 +....
  182 +(gdb) # don't want to continue iteration, unset the flag:
  183 +(gdb) set _st_iterate_threads_flag = 0
  184 +(gdb) c
  185 +Continuing.
  186 +Breakpoint 1, _st_show_thread_stack (thread=0x0, messg=0x80ae78e "Iteration
  187 +completed")
  188 + at sched.c:856
  189 +856 }
  190 +(gdb) c
  191 +Continuing.
  192 +(gdb) return
  193 +Make selected stack frame return now? (y or n) y
  194 +#0 0x4011254e in __select ()
  195 + from /lib/libc.so.6
  196 +(gdb) detach
  197 +
  198 +
  199 +CHANGE LOG
  200 +
  201 +Changes from 1.8 to 1.9.
  202 +------------------------
  203 +o Support 32-bit and 64-bit Intel Macs.
  204 +
  205 +o Added ST_VERSION string, and ST_VERSION_MAJOR and ST_VERSION_MINOR
  206 + [bug 1796801].
  207 +
  208 +o Fixed some compiler warnings, based on a patch from Brian Wellington
  209 + [bug 1932741].
  210 +
  211 +
  212 +Changes from 1.7 to 1.8.
  213 +--------------------------
  214 +o Added support for kqueue and epoll on platforms that support them.
  215 + Added ability to choose the event notification system at program
  216 + startup.
  217 +
  218 +o Long-overdue public definitions of ST_UTIME_NO_TIMEOUT (-1ULL) and
  219 + ST_UTIME_NO_WAIT (0) [bug 1514436].
  220 +
  221 +o Documentation patch for st_utime() [bug 1514484].
  222 +
  223 +o Documentation patch for st_timecache_set() [bug 1514486].
  224 +
  225 +o Documentation patch for st_netfd_serialize_accept() [bug 1514494].
  226 +
  227 +o Added st_writev_resid() [rfe 1538344].
  228 +
  229 +o Added st_readv_resid() [rfe 1538768] and, for symmetry, st_readv().
  230 +
  231 +
  232 +Changes from 1.6 to 1.7.
  233 +------------------------
  234 +o Support glibc 2.4, which breaks programs that manipulate jump buffers.
  235 + Replaced Linux IA64 special cases with new md.S that covers all
  236 + Linux.
  237 +
  238 +
  239 +Changes from 1.5.2 to 1.6.
  240 +--------------------------
  241 +none
  242 +
  243 +
  244 +Changes from 1.5.1 to 1.5.2.
  245 +----------------------------
  246 +o Alfred Perlstein's context switch callback feature.
  247 +
  248 +o Claus Assmann's st_recvmsg/st_sendmsg wrappers.
  249 +
  250 +o Extra stack padding for platforms that need it.
  251 +
  252 +o Ron Arts's timeout clarifications in the reference manual.
  253 +
  254 +o Raymond Bero and Anton Berezin's AMD64 FreeBSD port.
  255 +
  256 +o Claus Assmann's AMD64 SunOS 5.10 port.
  257 +
  258 +o Claus Assmann's AMD64 OpenBSD port.
  259 +
  260 +o Michael Abd-El-Malek's Mac OS X port.
  261 +
  262 +o Michael Abd-El-Malek's stack printing patch.
  263 +
  264 +
  265 +Changes from 1.5.0 to 1.5.1.
  266 +----------------------------
  267 +o Andreas Gustafsson's USE_POLL fix.
  268 +
  269 +o Gene's st_set_utime_function() enhancement.
  270 +
  271 +
  272 +Changes from 1.4 to 1.5.0.
  273 +--------------------------
  274 +o Andreas Gustafsson's performance patch.
  275 +
  276 +o New extensions: Improved DNS resolver, generic LRU cache, in-process
  277 + DNS cache, and a program to test the resolver and cache.
  278 +
  279 +o Support for AMD Opteron 64-bit CPUs under Linux.
  280 +
  281 +o Support for SPARC-64 under Solaris.
  282 +
  283 +o Andreas Gustafsson's support for VAX under NetBSD.
  284 +
  285 +o Changed unportable #warning directives in md.h to #error.
  286 +
  287 +
  288 +Changes from 1.3 to 1.4.
  289 +------------------------
  290 +o Andreas Gustafsson's NetBSD port.
  291 +
  292 +o Wesley W. Terpstra's Darwin (MacOS X) port.
  293 +
  294 +o Support for many CPU architectures under Linux and *BSD.
  295 +
  296 +o Renamed private typedefs so they don't conflict with public ones any
  297 + more.
  298 +
  299 +o common.h now includes public.h for strict prototyping.
  300 +
  301 +o Joshua Levy's recommendation to make st_connect() and st_sendto()
  302 + accept const struct sockaddr pointers, as the originals do.
  303 +
  304 +o Clarified the documentation regarding blocking vs. non-blocking I/O.
  305 +
  306 +o Cygwin support.
  307 +
  308 +o Created the extensions directory.
  309 +
  310 +o Fixed warnings from ia64asm.S.
  311 +
  312 +
  313 +Changes from 1.2 to 1.3.
  314 +------------------------
  315 +o Added st_read_resid() and st_write_resid() to allow the caller to know
  316 + how much data was transferred before an error occurred. Updated
  317 + documentation.
  318 +
  319 +o Updated project link, copyrights, and documentation regarding
  320 + timeouts. Added comment to st_connect().
  321 +
  322 +o Optimized the _st_add_sleep_q() function in sched.c. Now we walk the
  323 + sleep queue *backward* when inserting a thread into it. When you
  324 + have lots (hundreds) of threads and several timeout values, it takes
  325 + a while to insert a thread at the appropriate point in the sleep
  326 + queue. The idea is that often this appropriate point is closer to
  327 + the end of the queue rather than the beginning. Measurements show
  328 + performance improves with this change. In any case this change
  329 + should do no harm.
  330 +
  331 +o Added a hint of when to define USE_POLL and when not to, to the
  332 + Makefile.
  333 +
  334 +o Added debugging support (files common.h and sched.c). See above.
  335 +
  336 +o Decreased the number of reallocations of _ST_POLLFDS in sched.c.
  337 + Inspired by Lev Walkin.
  338 +
  339 +o Fixed st_usleep(-1) and st_sleep(-1), and added a warning to the
  340 + documentation about too-large timeouts.
  341 +
  342 +o Linux/*BSD Alpha port.
  343 +
  344 +o Wesley W. Terpstra modernized the build process:
  345 + - properly build relocatable libraries under bsd and linux
  346 + - use library versioning
  347 + - added rpm spec file
  348 + - added debian/ files
  349 + See above for build instructions.
  350 +
  351 +
  352 +Changes from 1.1 to 1.2.
  353 +------------------------
  354 +o Added st_randomize_stacks().
  355 +
  356 +o Added a patch contributed by Sascha Schumann.
  357 +
  358 +
  359 +Changes from 1.0 to 1.1.
  360 +------------------------
  361 +o Relicensed under dual MPL-GPL.
  362 +
  363 +o OpenBSD port.
  364 +
  365 +o Compile-time option to use poll() instead of select() for
  366 + event polling (see Makefile).
  367 + This is useful if you want to support a large number of open
  368 + file descriptors (larger than FD_SETSIZE) within a single
  369 + process.
  370 +
  371 +o Linux IA-64 port.
  372 + Two issues make IA-64 different from other platforms:
  373 +
  374 + - Besides the traditional call stack in memory, IA-64 uses the
  375 + general register stack. Thus each thread needs a backing store
  376 + for the register stack in addition to the memory stack.
  377 +
  378 + - The current implementation of setjmp()/longjmp() cannot be used
  379 + for thread context-switching since it assumes that only one
  380 + register stack exists. Using special assembly functions for
  381 + context-switching is unavoidable.
  382 +
  383 +o Thread stack capping on IRIX.
  384 + This allows some profiling tools (such as SpeedShop) to know when
  385 + to stop unwinding the stack. Without this, libexc (used by SpeedShop)
  386 + traces right off the stack and crashes.
  387 +
  388 +o Miscellaneous documentation additions.
  389 +
  390 +
  391 +COPYRIGHTS
  392 +
  393 +Portions created by SGI are Copyright (C) 2000 Silicon Graphics, Inc.
  394 +All Rights Reserved.
  1 +/*
  2 + * The contents of this file are subject to the Mozilla Public
  3 + * License Version 1.1 (the "License"); you may not use this file
  4 + * except in compliance with the License. You may obtain a copy of
  5 + * the License at http://www.mozilla.org/MPL/
  6 + *
  7 + * Software distributed under the License is distributed on an "AS
  8 + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
  9 + * implied. See the License for the specific language governing
  10 + * rights and limitations under the License.
  11 + *
  12 + * The Original Code is the Netscape Portable Runtime library.
  13 + *
  14 + * The Initial Developer of the Original Code is Netscape
  15 + * Communications Corporation. Portions created by Netscape are
  16 + * Copyright (C) 1994-2000 Netscape Communications Corporation. All
  17 + * Rights Reserved.
  18 + *
  19 + * Contributor(s): Silicon Graphics, Inc.
  20 + *
  21 + * Portions created by SGI are Copyright (C) 2000-2001 Silicon
  22 + * Graphics, Inc. All Rights Reserved.
  23 + *
  24 + * Alternatively, the contents of this file may be used under the
  25 + * terms of the GNU General Public License Version 2 or later (the
  26 + * "GPL"), in which case the provisions of the GPL are applicable
  27 + * instead of those above. If you wish to allow use of your
  28 + * version of this file only under the terms of the GPL and not to
  29 + * allow others to use your version of this file under the MPL,
  30 + * indicate your decision by deleting the provisions above and
  31 + * replace them with the notice and other provisions required by
  32 + * the GPL. If you do not delete the provisions above, a recipient
  33 + * may use your version of this file under either the MPL or the
  34 + * GPL.
  35 + */
  36 +
  37 +/*
  38 + * This file is derived directly from Netscape Communications Corporation,
  39 + * and consists of extensive modifications made during the year(s) 1999-2000.
  40 + */
  41 +
  42 +#ifndef __ST_COMMON_H__
  43 +#define __ST_COMMON_H__
  44 +
  45 +#include <stddef.h>
  46 +#include <unistd.h>
  47 +#include <sys/types.h>
  48 +#include <sys/time.h>
  49 +#include <setjmp.h>
  50 +
  51 +/* Assertions are active only in DEBUG builds: otherwise NDEBUG is defined before <assert.h> is included, which disables assert() and therefore ST_ASSERT() */
  52 +#ifndef DEBUG
  53 +#define NDEBUG
  54 +#endif
  55 +#include <assert.h>
  56 +#define ST_ASSERT(expr) assert(expr) /* "expr" is not evaluated when NDEBUG is defined, so it must not carry required side effects */
  57 +
  58 +#define ST_BEGIN_MACRO { /* opens the brace-delimited body of a multi-statement macro */
  59 +#define ST_END_MACRO } /* closes that body; NOTE(review): plain braces rather than do { } while (0), so "if (c) M(x); else ..." does not compile -- confirm call sites before changing */
  60 +
  61 +#ifdef DEBUG /* DEBUG builds: ST_HIDDEN expands to nothing, so declarations marked with it are not made static */
  62 +#define ST_HIDDEN /*nothing*/
  63 +#else /* other builds: ST_HIDDEN expands to the "static" storage-class specifier */
  64 +#define ST_HIDDEN static
  65 +#endif
  66 +
  67 +#include "public.h"
  68 +#include "md.h"
  69 +
  70 +
  71 +/*****************************************
  72 + * Circular linked list definitions
  73 + */
  74 +
  75 +typedef struct _st_clist { /* link node of a doubly-linked circular list; the list macros below take pointers to this type */
  76 + struct _st_clist *next; /* following node; points back at the node itself after ST_INIT_CLIST */
  77 + struct _st_clist *prev; /* preceding node; points back at the node itself after ST_INIT_CLIST */
  78 +} _st_clist_t;
  79 +
  80 +/* Insert element "_e" into the list immediately before "_l"; overwrites the links of "_e" without unlinking it first, and evaluates both arguments several times */
  81 +#define ST_INSERT_BEFORE(_e,_l) \
  82 + ST_BEGIN_MACRO \
  83 + (_e)->next = (_l); \
  84 + (_e)->prev = (_l)->prev; \
  85 + (_l)->prev->next = (_e); \
  86 + (_l)->prev = (_e); \
  87 + ST_END_MACRO
  88 +
  89 +/* Insert element "_e" into the list, after "_l" */
  90 +#define ST_INSERT_AFTER(_e,_l) \
  91 + ST_BEGIN_MACRO \
  92 + (_e)->next = (_l)->next; \
  93 + (_e)->prev = (_l); \
  94 + (_l)->next->prev = (_e); \
  95 + (_l)->next = (_e); \
  96 + ST_END_MACRO
  97 +
  98 +/* Return the element following element "_e" */
  99 +#define ST_NEXT_LINK(_e) ((_e)->next)
  100 +
  101 +/* Append an element "_e" to the end of the list "_l" */
  102 +#define ST_APPEND_LINK(_e,_l) ST_INSERT_BEFORE(_e,_l)
  103 +
  104 +/* Insert an element "_e" at the head of the list "_l" */
  105 +#define ST_INSERT_LINK(_e,_l) ST_INSERT_AFTER(_e,_l)
  106 +
  107 +/* Return the head/tail of the list */
  108 +#define ST_LIST_HEAD(_l) (_l)->next
  109 +#define ST_LIST_TAIL(_l) (_l)->prev
  110 +
  111 +/* Remove the element "_e" from its circular list */
  112 +#define ST_REMOVE_LINK(_e) \
  113 + ST_BEGIN_MACRO \
  114 + (_e)->prev->next = (_e)->next; \
  115 + (_e)->next->prev = (_e)->prev; \
  116 + ST_END_MACRO
  117 +
  118 +/* Return non-zero if the given circular list "_l" is empty, */
  119 +/* zero if the circular list is not empty */
  120 +#define ST_CLIST_IS_EMPTY(_l) \
  121 + ((_l)->next == (_l))
  122 +
  123 +/* Initialize a circular list */
  124 +#define ST_INIT_CLIST(_l) \
  125 + ST_BEGIN_MACRO \
  126 + (_l)->next = (_l); \
  127 + (_l)->prev = (_l); \
  128 + ST_END_MACRO
  129 +
  130 +#define ST_INIT_STATIC_CLIST(_l) \
  131 + {(_l), (_l)}
  132 +
  133 +
  134 +/*****************************************
  135 + * Basic types definitions
  136 + */
  137 +
  138 +typedef void (*_st_destructor_t)(void *);
  139 +
  140 +
  141 +typedef struct _st_stack {
  142 + _st_clist_t links;
  143 + char *vaddr; /* Base of stack's allocated memory */
  144 + int vaddr_size; /* Size of stack's allocated memory */
  145 + int stk_size; /* Size of usable portion of the stack */
  146 + char *stk_bottom; /* Lowest address of stack's usable portion */
  147 + char *stk_top; /* Highest address of stack's usable portion */
  148 + void *sp; /* Stack pointer from C's point of view */
  149 +#ifdef __ia64__
  150 + void *bsp; /* Register stack backing store pointer */
  151 +#endif
  152 +} _st_stack_t;
  153 +
  154 +
  155 +typedef struct _st_cond {
  156 + _st_clist_t wait_q; /* Condition variable wait queue */
  157 +} _st_cond_t;
  158 +
  159 +
  160 +typedef struct _st_thread _st_thread_t;
  161 +
  162 +struct _st_thread {
  163 + int state; /* Thread's state */
  164 + int flags; /* Thread's flags */
  165 +
  166 + void *(*start)(void *arg); /* The start function of the thread */
  167 + void *arg; /* Argument of the start function */
  168 + void *retval; /* Return value of the start function */
  169 +
  170 + _st_stack_t *stack; /* Info about thread's stack */
  171 +
  172 + _st_clist_t links; /* For putting on run/sleep/zombie queue */
  173 + _st_clist_t wait_links; /* For putting on mutex/condvar wait queue */
  174 +#ifdef DEBUG
  175 + _st_clist_t tlink; /* For putting on thread queue */
  176 +#endif
  177 +
  178 + st_utime_t due; /* Wakeup time when thread is sleeping */
  179 + _st_thread_t *left; /* For putting in timeout heap */
  180 + _st_thread_t *right; /* -- see docs/timeout_heap.txt for details */
  181 + int heap_index;
  182 +
  183 + void **private_data; /* Per thread private data */
  184 +
  185 + _st_cond_t *term; /* Termination condition variable for join */
  186 +
  187 + jmp_buf context; /* Thread's context */
  188 +};
  189 +
  190 +
  191 +typedef struct _st_mutex {
  192 + _st_thread_t *owner; /* Current mutex owner */
  193 + _st_clist_t wait_q; /* Mutex wait queue */
  194 +} _st_mutex_t;
  195 +
  196 +
  197 +typedef struct _st_pollq {
  198 + _st_clist_t links; /* For putting on io queue */
  199 + _st_thread_t *thread; /* Polling thread */
  200 + struct pollfd *pds; /* Array of poll descriptors */
  201 + int npds; /* Length of the array */
  202 + int on_ioq; /* Is it on ioq? */
  203 +} _st_pollq_t;
  204 +
  205 +
  206 +typedef struct _st_eventsys_ops {
  207 + const char *name; /* Name of this event system */
  208 + int val; /* Type of this event system */
  209 + int (*init)(void); /* Initialization */
  210 + void (*dispatch)(void); /* Dispatch function */
  211 + int (*pollset_add)(struct pollfd *, int); /* Add descriptor set */
  212 + void (*pollset_del)(struct pollfd *, int); /* Delete descriptor set */
  213 + int (*fd_new)(int); /* New descriptor allocated */
  214 + int (*fd_close)(int); /* Descriptor closed */
  215 + int (*fd_getlimit)(void); /* Descriptor hard limit */
  216 +} _st_eventsys_t;
  217 +
  218 +
  219 +typedef struct _st_vp {
  220 + _st_thread_t *idle_thread; /* Idle thread for this vp */
  221 + st_utime_t last_clock; /* The last time we went into vp_check_clock() */
  222 +
  223 + _st_clist_t run_q; /* run queue for this vp */
  224 + _st_clist_t io_q; /* io queue for this vp */
  225 + _st_clist_t zombie_q; /* zombie queue for this vp */
  226 +#ifdef DEBUG
  227 + _st_clist_t thread_q; /* all threads of this vp */
  228 +#endif
  229 + int pagesize;
  230 +
  231 + _st_thread_t *sleep_q; /* sleep queue for this vp */
  232 + int sleepq_size; /* number of threads on sleep queue */
  233 +
  234 +#ifdef ST_SWITCH_CB
  235 + st_switch_cb_t switch_out_cb; /* called when a thread is switched out */
  236 + st_switch_cb_t switch_in_cb; /* called when a thread is switched in */
  237 +#endif
  238 +} _st_vp_t;
  239 +
  240 +
  241 +typedef struct _st_netfd {
  242 + int osfd; /* Underlying OS file descriptor */
  243 + int inuse; /* In-use flag */
  244 + void *private_data; /* Per descriptor private data */
  245 + _st_destructor_t destructor; /* Private data destructor function */
  246 + void *aux_data; /* Auxiliary data for internal use */
  247 + struct _st_netfd *next; /* For putting on the free list */
  248 +} _st_netfd_t;
  249 +
  250 +
  251 +/*****************************************
  252 + * Current vp, thread, and event system
  253 + */
  254 +
  255 +extern _st_vp_t _st_this_vp;
  256 +extern _st_thread_t *_st_this_thread;
  257 +extern _st_eventsys_t *_st_eventsys;
  258 +
  259 +#define _ST_CURRENT_THREAD() (_st_this_thread)
  260 +#define _ST_SET_CURRENT_THREAD(_thread) (_st_this_thread = (_thread))
  261 +
  262 +#define _ST_LAST_CLOCK (_st_this_vp.last_clock)
  263 +
  264 +#define _ST_RUNQ (_st_this_vp.run_q)
  265 +#define _ST_IOQ (_st_this_vp.io_q)
  266 +#define _ST_ZOMBIEQ (_st_this_vp.zombie_q)
  267 +#ifdef DEBUG
  268 +#define _ST_THREADQ (_st_this_vp.thread_q)
  269 +#endif
  270 +
  271 +#define _ST_PAGE_SIZE (_st_this_vp.pagesize)
  272 +
  273 +#define _ST_SLEEPQ (_st_this_vp.sleep_q)
  274 +#define _ST_SLEEPQ_SIZE (_st_this_vp.sleepq_size)
  275 +
  276 +#define _ST_VP_IDLE() (*_st_eventsys->dispatch)()
  277 +
  278 +
  279 +/*****************************************
  280 + * vp queues operations
  281 + */
  282 +
  283 +#define _ST_ADD_IOQ(_pq) ST_APPEND_LINK(&_pq.links, &_ST_IOQ)
  284 +#define _ST_DEL_IOQ(_pq) ST_REMOVE_LINK(&_pq.links)
  285 +
  286 +#define _ST_ADD_RUNQ(_thr) ST_APPEND_LINK(&(_thr)->links, &_ST_RUNQ)
  287 +#define _ST_DEL_RUNQ(_thr) ST_REMOVE_LINK(&(_thr)->links)
  288 +
  289 +#define _ST_ADD_SLEEPQ(_thr, _timeout) _st_add_sleep_q(_thr, _timeout)
  290 +#define _ST_DEL_SLEEPQ(_thr) _st_del_sleep_q(_thr)
  291 +
  292 +#define _ST_ADD_ZOMBIEQ(_thr) ST_APPEND_LINK(&(_thr)->links, &_ST_ZOMBIEQ)
  293 +#define _ST_DEL_ZOMBIEQ(_thr) ST_REMOVE_LINK(&(_thr)->links)
  294 +
  295 +#ifdef DEBUG
  296 +#define _ST_ADD_THREADQ(_thr) ST_APPEND_LINK(&(_thr)->tlink, &_ST_THREADQ)
  297 +#define _ST_DEL_THREADQ(_thr) ST_REMOVE_LINK(&(_thr)->tlink)
  298 +#endif
  299 +
  300 +
  301 +/*****************************************
  302 + * Thread states and flags
  303 + */
  304 +
  305 +#define _ST_ST_RUNNING 0
  306 +#define _ST_ST_RUNNABLE 1
  307 +#define _ST_ST_IO_WAIT 2
  308 +#define _ST_ST_LOCK_WAIT 3
  309 +#define _ST_ST_COND_WAIT 4
  310 +#define _ST_ST_SLEEPING 5
  311 +#define _ST_ST_ZOMBIE 6
  312 +#define _ST_ST_SUSPENDED 7
  313 +
  314 +#define _ST_FL_PRIMORDIAL 0x01
  315 +#define _ST_FL_IDLE_THREAD 0x02
  316 +#define _ST_FL_ON_SLEEPQ 0x04
  317 +#define _ST_FL_INTERRUPT 0x08
  318 +#define _ST_FL_TIMEDOUT 0x10
  319 +
  320 +
  321 +/*****************************************
  322 + * Pointer conversion
  323 + */
  324 +
  325 +#ifndef offsetof
  326 +#define offsetof(type, identifier) ((size_t)&(((type *)0)->identifier))
  327 +#endif
  328 +
  329 +#define _ST_THREAD_PTR(_qp) \
  330 + ((_st_thread_t *)((char *)(_qp) - offsetof(_st_thread_t, links)))
  331 +
  332 +#define _ST_THREAD_WAITQ_PTR(_qp) \
  333 + ((_st_thread_t *)((char *)(_qp) - offsetof(_st_thread_t, wait_links)))
  334 +
  335 +#define _ST_THREAD_STACK_PTR(_qp) \
  336 + ((_st_stack_t *)((char*)(_qp) - offsetof(_st_stack_t, links)))
  337 +
  338 +#define _ST_POLLQUEUE_PTR(_qp) \
  339 + ((_st_pollq_t *)((char *)(_qp) - offsetof(_st_pollq_t, links)))
  340 +
  341 +#ifdef DEBUG
  342 +#define _ST_THREAD_THREADQ_PTR(_qp) \
  343 + ((_st_thread_t *)((char *)(_qp) - offsetof(_st_thread_t, tlink)))
  344 +#endif
  345 +
  346 +
  347 +/*****************************************
  348 + * Constants
  349 + */
  350 +
  351 +#ifndef ST_UTIME_NO_TIMEOUT
  352 +#define ST_UTIME_NO_TIMEOUT ((st_utime_t) -1LL)
  353 +#endif
  354 +
  355 +#ifndef __ia64__
  356 +#define ST_DEFAULT_STACK_SIZE (64*1024)
  357 +#else
  358 +#define ST_DEFAULT_STACK_SIZE (128*1024) /* Includes register stack size */
  359 +#endif
  360 +
  361 +#ifndef ST_KEYS_MAX
  362 +#define ST_KEYS_MAX 16
  363 +#endif
  364 +
  365 +#ifndef ST_MIN_POLLFDS_SIZE
  366 +#define ST_MIN_POLLFDS_SIZE 64
  367 +#endif
  368 +
  369 +
  370 +/*****************************************
  371 + * Threads context switching
  372 + */
  373 +
  374 +#ifdef DEBUG
  375 +void _st_iterate_threads(void);
  376 +#define ST_DEBUG_ITERATE_THREADS() _st_iterate_threads()
  377 +#else
  378 +#define ST_DEBUG_ITERATE_THREADS()
  379 +#endif
  380 +
  381 +#ifdef ST_SWITCH_CB
  382 +#define ST_SWITCH_OUT_CB(_thread) \
  383 + if (_st_this_vp.switch_out_cb != NULL && \
  384 + _thread != _st_this_vp.idle_thread && \
  385 + _thread->state != _ST_ST_ZOMBIE) { \
  386 + _st_this_vp.switch_out_cb(); \
  387 + }
  388 +#define ST_SWITCH_IN_CB(_thread) \
  389 + if (_st_this_vp.switch_in_cb != NULL && \
  390 + _thread != _st_this_vp.idle_thread && \
  391 + _thread->state != _ST_ST_ZOMBIE) { \
  392 + _st_this_vp.switch_in_cb(); \
  393 + }
  394 +#else
  395 +#define ST_SWITCH_OUT_CB(_thread)
  396 +#define ST_SWITCH_IN_CB(_thread)
  397 +#endif
  398 +
  399 +/*
  400 + * Switch away from the current thread context by saving its state and
  401 + * calling the thread scheduler
  402 + */
  403 +#define _ST_SWITCH_CONTEXT(_thread) \
  404 + ST_BEGIN_MACRO \
  405 + ST_SWITCH_OUT_CB(_thread); \
  406 + if (!MD_SETJMP((_thread)->context)) { \
  407 + _st_vp_schedule(); \
  408 + } \
  409 + ST_DEBUG_ITERATE_THREADS(); \
  410 + ST_SWITCH_IN_CB(_thread); \
  411 + ST_END_MACRO
  412 +
  413 +/*
  414 + * Restore a thread context that was saved by _ST_SWITCH_CONTEXT or
  415 + * initialized by _ST_INIT_CONTEXT
  416 + */
  417 +#define _ST_RESTORE_CONTEXT(_thread) \
  418 + ST_BEGIN_MACRO \
  419 + _ST_SET_CURRENT_THREAD(_thread); \
  420 + MD_LONGJMP((_thread)->context, 1); \
  421 + ST_END_MACRO
  422 +
  423 +/*
  424 + * Initialize the thread context preparing it to execute _main
  425 + */
  426 +#ifdef MD_INIT_CONTEXT
  427 +#define _ST_INIT_CONTEXT MD_INIT_CONTEXT
  428 +#else
  429 +#error Unknown OS
  430 +#endif
  431 +
  432 +/*
  433 + * Number of bytes reserved under the stack "bottom"
  434 + */
  435 +#define _ST_STACK_PAD_SIZE MD_STACK_PAD_SIZE
  436 +
  437 +
  438 +/*****************************************
  439 + * Forward declarations
  440 + */
  441 +
  442 +void _st_vp_schedule(void);
  443 +void _st_vp_check_clock(void);
  444 +void *_st_idle_thread_start(void *arg);
  445 +void _st_thread_main(void);
  446 +void _st_thread_cleanup(_st_thread_t *thread);
  447 +void _st_add_sleep_q(_st_thread_t *thread, st_utime_t timeout);
  448 +void _st_del_sleep_q(_st_thread_t *thread);
  449 +_st_stack_t *_st_stack_new(int stack_size);
  450 +void _st_stack_free(_st_stack_t *ts);
  451 +int _st_io_init(void);
  452 +
  453 +st_utime_t st_utime(void);
  454 +_st_cond_t *st_cond_new(void);
  455 +int st_cond_destroy(_st_cond_t *cvar);
  456 +int st_cond_timedwait(_st_cond_t *cvar, st_utime_t timeout);
  457 +int st_cond_signal(_st_cond_t *cvar);
  458 +ssize_t st_read(_st_netfd_t *fd, void *buf, size_t nbyte, st_utime_t timeout);
  459 +ssize_t st_write(_st_netfd_t *fd, const void *buf, size_t nbyte,
  460 + st_utime_t timeout);
  461 +int st_poll(struct pollfd *pds, int npds, st_utime_t timeout);
  462 +_st_thread_t *st_thread_create(void *(*start)(void *arg), void *arg,
  463 + int joinable, int stk_size);
  464 +
  465 +#endif /* !__ST_COMMON_H__ */
  466 +
  1 +<HTML>
  2 +<HEAD>
  3 +<TITLE>State Threads Library Programming Notes</TITLE>
  4 +</HEAD>
  5 +<BODY BGCOLOR=#FFFFFF>
  6 +<H2>Programming Notes</H2>
  7 +<P>
  8 +<B>
  9 +<UL>
  10 +<LI><A HREF=#porting>Porting</A></LI>
  11 +<LI><A HREF=#signals>Signals</A></LI>
  12 +<LI><A HREF=#intra>Intra-Process Synchronization</A></LI>
  13 +<LI><A HREF=#inter>Inter-Process Synchronization</A></LI>
  14 +<LI><A HREF=#nonnet>Non-Network I/O</A></LI>
  15 +<LI><A HREF=#timeouts>Timeouts</A></LI>
  16 +</UL>
  17 +</B>
  18 +<P>
  19 +<HR>
  20 +<P>
  21 +<A NAME="porting">
  22 +<H3>Porting</H3></A>
  23 +The State Threads library uses OS concepts that are available in some
  24 +form on most UNIX platforms, making the library very portable across
  25 +many flavors of UNIX. However, there are several parts of the library
  26 +that rely on platform-specific features. Here is the list of such parts:
  27 +<P>
  28 +<UL>
  29 +<LI><I>Thread context initialization</I>: Two ingredients of the
  30 +<TT>jmp_buf</TT>
  31 +data structure (the program counter and the stack pointer) have to be
  32 +manually set in the thread creation routine. The <TT>jmp_buf</TT> data
  33 +structure is defined in the <TT>setjmp.h</TT> header file and differs from
  34 +platform to platform. Usually the program counter is a structure member
  35 +with <TT>PC</TT> in the name and the stack pointer is a structure member
  36 +with <TT>SP</TT> in the name. One can also look at
  37 +<A HREF="http://www.mozilla.org/source.html">Netscape's NSPR library source</A>,
  38 +which already has this code for many UNIX-like platforms
  39 +(<TT>mozilla/nsprpub/pr/include/md/*.h</TT> files).
  40 +<P>
  41 +Note that on some BSD-derived platforms <TT>_setjmp(3)/_longjmp(3)</TT>
  42 +calls should be used instead of <TT>setjmp(3)/longjmp(3)</TT> (that is,
  43 +the calls that manipulate only the stack and registers and do <I>not</I>
  44 +save and restore the process's signal mask).
  45 +<P>
  46 +Starting with glibc 2.4 on Linux the opacity of the <TT>jmp_buf</TT> data
  47 +structure is enforced by <TT>setjmp(3)/longjmp(3)</TT> so the
  48 +<TT>jmp_buf</TT> ingredients cannot be accessed directly anymore (unless
  49 +the special environment variable <TT>LD_POINTER_GUARD</TT> is set before application
  50 +execution). To avoid dependency on a custom environment, the State Threads
  51 +library provides <TT>setjmp/longjmp</TT> replacement functions for
  52 +all Intel CPU architectures. Other CPU architectures can also be easily
  53 +supported (the <TT>setjmp/longjmp</TT> source code is widely available for
  54 +many CPU architectures).</LI>
  55 +<P>
  56 +<LI><I>High resolution time function</I>: Some platforms (IRIX, Solaris)
  57 +provide a high resolution time function based on the free running hardware
  58 +counter. This function returns the time counted since some arbitrary
  59 +moment in the past (usually machine power up time). It is not correlated in
  60 +any way to the time of day, and thus is not subject to resetting,
  61 +drifting, etc. This type of time is ideal for tasks where cheap, accurate
  62 +interval timing is required. If such a function is not available on a
  63 +particular platform, the <TT>gettimeofday(3)</TT> function can be used
  64 +(though on some platforms it involves a system call).</LI>
  65 +<P>
  66 +<LI><I>The stack growth direction</I>: The library needs to know whether the
  67 +stack grows toward lower (down) or higher (up) memory addresses.
  68 +One can write a simple test program that detects the stack growth direction
  69 +on a particular platform.</LI>
  70 +<P>
  71 +<LI><I>Non-blocking attribute inheritance</I>: On some platforms (e.g. IRIX)
  72 +the socket created as a result of the <TT>accept(2)</TT> call inherits the
  73 +non-blocking attribute of the listening socket. One needs to consult the manual
  74 +pages or write a simple test program to see if this applies to a specific
  75 +platform.</LI>
  76 +<P>
  77 +<LI><I>Anonymous memory mapping</I>: The library allocates memory segments
  78 +for thread stacks by doing anonymous memory mapping (<TT>mmap(2)</TT>). This
  79 +mapping is somewhat different on SVR4 and BSD4.3 derived platforms.
  80 +<P>
  81 +The memory mapping can be avoided altogether by using <TT>malloc(3)</TT> for
  82 +stack allocation. In this case the <TT>MALLOC_STACK</TT> macro should be
  83 +defined.</LI>
  84 +</UL>
  85 +<P>
  86 +All machine-dependent feature test macros should be defined in the
  87 +<TT>md.h</TT> header file. The assembly code for <TT>setjmp/longjmp</TT>
  88 +replacement functions for all CPU architectures should be placed in
  89 +the <TT>md.S</TT> file.
  90 +<P>
  91 +The current version of the library is ported to:
  92 +<UL>
  93 + <LI>IRIX 6.x (both 32 and 64 bit)</LI>
  94 + <LI>Linux (kernel 2.x and glibc 2.x) on x86, Alpha, MIPS and MIPSEL,
  95 + SPARC, ARM, PowerPC, 68k, HPPA, S390, IA-64, and Opteron (AMD-64)</LI>
  96 + <LI>Solaris 2.x (SunOS 5.x) on x86, AMD64, SPARC, and SPARC-64</LI>
  97 + <LI>AIX 4.x</LI>
  98 + <LI>HP-UX 11 (both 32 and 64 bit)</LI>
  99 + <LI>Tru64/OSF1</LI>
  100 + <LI>FreeBSD on x86, AMD64, and Alpha</LI>
  101 + <LI>OpenBSD on x86, AMD64, Alpha, and SPARC</LI>
  102 + <LI>NetBSD on x86, Alpha, SPARC, and VAX</LI>
  103 + <LI>MacOS X (Darwin) on PowerPC (32 bit) and Intel (both 32 and 64 bit) [universal]</LI>
  104 + <LI>Cygwin</LI>
  105 +</UL>
  106 +<P>
  107 +
  108 +<A NAME="signals">
  109 +<H3>Signals</H3></A>
  110 +Signal handling in an application using State Threads should be treated the
  111 +same way as in a classical UNIX process application. There is no such
  112 +thing as a per-thread signal mask; all threads share the same signal handlers,
  113 +and only async-signal-safe functions can be used in signal handlers.
  114 +However, there is a way to process signals synchronously by converting a
  115 +signal event to an I/O event: a signal catching function does a write to
  116 +a pipe which will be processed synchronously by a dedicated signal handling
  117 +thread. The following code demonstrates this technique (error handling is
  118 +omitted for clarity):
  119 +<PRE>
  120 +
  121 +/* Per-process pipe which is used as a signal queue. */
  122 +/* Up to PIPE_BUF/sizeof(int) signals can be queued up. */
  123 +int sig_pipe[2];
  124 +
  125 +/* Signal catching function. */
  126 +/* Converts signal event to I/O event. */
  127 +void sig_catcher(int signo)
  128 +{
  129 + int err;
  130 +
  131 + /* Save errno to restore it after the write() */
  132 + err = errno;
  133 + /* write() is async-signal-safe */
  134 + write(sig_pipe[1], &signo, sizeof(int));
  135 + errno = err;
  136 +}
  137 +
  138 +/* Signal processing function. */
  139 +/* This is the "main" function of the signal processing thread. */
  140 +void *sig_process(void *arg)
  141 +{
  142 + st_netfd_t nfd;
  143 + int signo;
  144 +
  145 + nfd = st_netfd_open(sig_pipe[0]);
  146 +
  147 + for ( ; ; ) {
  148 + /* Read the next signal from the pipe */
  149 + st_read(nfd, &signo, sizeof(int), ST_UTIME_NO_TIMEOUT);
  150 +
  151 + /* Process signal synchronously */
  152 + switch (signo) {
  153 + case SIGHUP:
  154 + /* do something here - reread config files, etc. */
  155 + break;
  156 + case SIGTERM:
  157 + /* do something here - cleanup, etc. */
  158 + break;
  159 + /* .
  160 + .
  161 + Other signals
  162 + .
  163 + .
  164 + */
  165 + }
  166 + }
  167 +
  168 + return NULL;
  169 +}
  170 +
  171 +int main(int argc, char *argv[])
  172 +{
  173 + struct sigaction sa;
  174 + .
  175 + .
  176 + .
  177 +
  178 + /* Create signal pipe */
  179 + pipe(sig_pipe);
  180 +
  181 + /* Create signal processing thread */
  182 + st_thread_create(sig_process, NULL, 0, 0);
  183 +
  184 + /* Install sig_catcher() as a signal handler */
  185 + sa.sa_handler = sig_catcher;
  186 + sigemptyset(&sa.sa_mask);
  187 + sa.sa_flags = 0;
  188 + sigaction(SIGHUP, &sa, NULL);
  189 +
  190 + sa.sa_handler = sig_catcher;
  191 + sigemptyset(&sa.sa_mask);
  192 + sa.sa_flags = 0;
  193 + sigaction(SIGTERM, &sa, NULL);
  194 +
  195 + .
  196 + .
  197 + .
  198 +
  199 +}
  200 +
  201 +</PRE>
  202 +<P>
  203 +Note that if multiple processes are used (see below), the signal pipe should
  204 +be initialized after the <TT>fork(2)</TT> call so that each process has its
  205 +own private pipe.
  206 +<P>
  207 +
  208 +<A NAME="intra">
  209 +<H3>Intra-Process Synchronization</H3></A>
  210 +Due to the event-driven nature of the library scheduler, the thread context
  211 +switch (process state change) can only happen in a well-known set of
  212 +library functions. This set includes functions in which a thread may
  213 +"block":<TT> </TT>I/O functions (<TT>st_read(), st_write(), </TT>etc.),
  214 +sleep functions (<TT>st_sleep(), </TT>etc.), and thread synchronization
  215 +functions (<TT>st_thread_join(), st_cond_wait(), </TT>etc.). As a result,
  216 +process-specific global data need not be protected by locks since a thread
  217 +cannot be rescheduled while in a critical section (and only one thread at a
  218 +time can access the same memory location). By the same token,
  219 +non-thread-safe functions (in a traditional sense) can be safely used with
  220 +the State Threads. The library's mutex facilities are practically useless
  221 +for a correctly written application (no blocking functions in critical
  222 +section) and are provided mostly for completeness. This absence of locking
  223 +greatly simplifies an application design and provides a foundation for
  224 +scalability.
  225 +<P>
  226 +
  227 +<A NAME="inter">
  228 +<H3>Inter-Process Synchronization</H3></A>
  229 +The State Threads library makes it possible to multiplex a large number
  230 +of simultaneous connections onto a much smaller number of separate
  231 +processes, where each process uses a many-to-one user-level threading
  232 +implementation (<B>N</B> of <B>M:1</B> mappings rather than one <B>M:N</B>
  233 +mapping used in native threading libraries on some platforms). This design
  234 +is key to the application's scalability. One can think about it as if a
  235 +set of all threads is partitioned into separate groups (processes) where
  236 +each group has a separate pool of resources (virtual address space, file
  237 +descriptors, etc.). An application designer has full control of how many
  238 +groups (processes) an application creates and what resources, if any,
  239 +are shared among different groups via standard UNIX inter-process
  240 +communication (IPC) facilities.<P>
  241 +There are several reasons for creating multiple processes:
  242 +<P>
  243 +<UL>
  244 +<LI>To take advantage of multiple hardware entities (CPUs, disks, etc.)
  245 +available in the system (hardware parallelism).</LI>
  246 +<P>
  247 +<LI>To reduce risk of losing a large number of user connections when one of
  248 +the processes crashes. For example, if <B>C</B> user connections (threads)
  249 +are multiplexed onto <B>P</B> processes and one of the processes crashes,
  250 +only a fraction (<B>C/P</B>) of all connections will be lost.</LI>
  251 +<P>
  252 +<LI>To overcome per-process resource limitations imposed by the OS. For
  253 +example, if <TT>select(2)</TT> is used for event polling, the number of
  254 +simultaneous connections (threads) per process is
  255 +limited by the <TT>FD_SETSIZE</TT> parameter (see <TT>select(2)</TT>).
  256 +If <TT>FD_SETSIZE</TT> is equal to 1024 and each connection needs one file
  257 +descriptor, then an application should create 10 processes to support 10,000
  258 +simultaneous connections.</LI>
  259 +</UL>
  260 +<P>
  261 +Ideally all user sessions are completely independent, so there is no need for
  262 +inter-process communication. It is always better to have several separate
  263 +smaller process-specific resources (e.g., data caches) than to have one large
  264 +resource shared (and modified) by all processes. Sometimes, however, there
  265 +is a need to share a common resource among different processes. In that case,
  266 +standard UNIX IPC facilities can be used. In addition to that, there is a way
  267 +to synchronize different processes so that only the thread accessing the
  268 +shared resource will be suspended (but not the entire process) if that resource
  269 +is unavailable. In the following code fragment a pipe is used as a counting
  270 +semaphore for inter-process synchronization:
  271 +<PRE>
  272 +#ifndef PIPE_BUF
  273 +#define PIPE_BUF 512 /* POSIX */
  274 +#endif
  275 +
  276 +/* Semaphore data structure */
  277 +typedef struct ipc_sem {
  278 + st_netfd_t rdfd; /* read descriptor */
  279 + st_netfd_t wrfd; /* write descriptor */
  280 +} ipc_sem_t;
  281 +
  282 +/* Create and initialize the semaphore. Should be called before fork(2). */
  283 +/* 'value' must be less than PIPE_BUF. */
  284 +/* If 'value' is 1, the semaphore works as mutex. */
  285 +ipc_sem_t *ipc_sem_create(int value)
  286 +{
  287 + ipc_sem_t *sem;
  288 + int p[2];
  289 + char b[PIPE_BUF];
  290 +
  291 + /* Error checking is omitted for clarity */
  292 + sem = malloc(sizeof(ipc_sem_t));
  293 +
  294 + /* Create the pipe */
  295 + pipe(p);
  296 + sem->rdfd = st_netfd_open(p[0]);
  297 + sem->wrfd = st_netfd_open(p[1]);
  298 +
  299 + /* Initialize the semaphore: put 'value' bytes into the pipe */
  300 + write(p[1], b, value);
  301 +
  302 + return sem;
  303 +}
  304 +
  305 +/* Try to decrement the "value" of the semaphore. */
  306 +/* If "value" is 0, the calling thread blocks on the semaphore. */
  307 +int ipc_sem_wait(ipc_sem_t *sem)
  308 +{
  309 + char c;
  310 +
  311 + /* Read one byte from the pipe */
  312 + if (st_read(sem->rdfd, &c, 1, ST_UTIME_NO_TIMEOUT) != 1)
  313 + return -1;
  314 +
  315 + return 0;
  316 +}
  317 +
  318 +/* Increment the "value" of the semaphore. */
  319 +int ipc_sem_post(ipc_sem_t *sem)
  320 +{
  321 + char c;
  322 +
  323 + if (st_write(sem->wrfd, &c, 1, ST_UTIME_NO_TIMEOUT) != 1)
  324 + return -1;
  325 +
  326 + return 0;
  327 +}
  328 +
  329 +</PRE>
  330 +<P>
  331 +
  332 +Generally, the following steps should be followed when writing an application
  333 +using the State Threads library:
  334 +<P>
  335 +<OL>
  336 +<LI>Initialize the library (<TT>st_init()</TT>).</LI>
  337 +<P>
  338 +<LI>Create resources that will be shared among different processes:
  339 + create and bind listening sockets, create shared memory segments, IPC
  340 + channels, synchronization primitives, etc.</LI>
  341 +<P>
  342 +<LI>Create several processes (<TT>fork(2)</TT>). The parent process should
  343 + either exit or become a "watchdog" (e.g., it starts a new process when
  344 + an existing one crashes, does a cleanup upon application termination,
  345 + etc.).</LI>
  346 +<P>
  347 +<LI>In each child process create a pool of threads
  348 + (<TT>st_thread_create()</TT>) to handle user connections.</LI>
  349 +</OL>
  350 +<P>
  351 +
  352 +<A NAME="nonnet">
  353 +<H3>Non-Network I/O</H3></A>
  354 +
  355 +The State Threads architecture uses non-blocking I/O on
  356 +<TT>st_netfd_t</TT> objects for concurrent processing of multiple user
  357 +connections. This architecture has a drawback: the entire process and
  358 +all its threads may block for the duration of a <I>disk</I> or other
  359 +non-network I/O operation, whether through State Threads I/O functions,
  360 +direct system calls, or standard I/O functions. (This is applicable
  361 +mostly to disk <I>reads</I>; disk <I>writes</I> are usually performed
  362 +asynchronously -- data goes to the buffer cache to be written to disk
  363 +later.) Fortunately, disk I/O (unlike network I/O) usually takes a
  364 +finite and predictable amount of time, but this may not be true for
  365 +special devices or user input devices (including stdin). Nevertheless,
  366 +such I/O reduces throughput of the system and increases response times.
  367 +There are several ways to design an application to overcome this
  368 +drawback:
  369 +
  370 +<P>
  371 +<UL>
  372 +<LI>Create several identical main processes as described above (symmetric
  373 + architecture). This will improve CPU utilization and thus improve the
  374 + overall throughput of the system.</LI>
  375 +<P>
  376 +<LI>Create multiple "helper" processes in addition to the main process that
  377 + will handle blocking I/O operations (asymmetric architecture).
  378 + This approach was suggested for Web servers in a
  379 + <A HREF="http://www.cs.rice.edu/~vivek/flash99/">paper</A> by Peter
  380 + Druschel et al. In this architecture the main process communicates with
  381 + a helper process via an IPC channel (<TT>pipe(2), socketpair(2)</TT>).
  382 + The main process instructs a helper to perform the potentially blocking
  383 + operation. Once the operation completes, the helper returns a
  384 + notification via IPC.</LI>
  385 +</UL>
  386 +<P>
  387 +
  388 +<A NAME="timeouts">
  389 +<H3>Timeouts</H3></A>
  390 +
  391 +The <TT>timeout</TT> parameter to <TT>st_cond_timedwait()</TT> and the
  392 +I/O functions, and the arguments to <TT>st_sleep()</TT> and
  393 +<TT>st_usleep()</TT> specify a maximum time to wait <I>since the last
  394 +context switch</I>, not since the beginning of the function call.
  395 +
  396 +<P>The State Threads' time resolution is actually the time interval
  397 +between context switches. That time interval may be large in some
  398 +situations, for example, when a single thread does a lot of work
  399 +continuously. Note that a steady, uninterrupted stream of network I/O
  400 +qualifies for this description; a context switch occurs only when a
  401 +thread blocks.
  402 +
  403 +<P>If a specified I/O timeout is less than the time interval between
  404 +context switches, the function may return with a timeout error before
  405 +that amount of time has elapsed since the beginning of the function
  406 +call. For example, if eight milliseconds have passed since the last
  407 +context switch and an I/O function with a timeout of 10 milliseconds
  408 +blocks, causing a switch, the call may return with a timeout error as
  409 +little as two milliseconds after it was called. (On Linux,
  410 +<TT>select()</TT>'s timeout is an <I>upper</I> bound on the amount of
  411 +time elapsed before select returns.) Similarly, if 12 ms have passed
  412 +already, the function may return immediately.
  413 +
  414 +<P>In almost all cases I/O timeouts should be used only for detecting a
  415 +broken network connection or for preventing a peer from holding an idle
  416 +connection for too long. Therefore for most applications realistic I/O
  417 +timeouts should be on the order of seconds. Furthermore, there's
  418 +probably no point in retrying operations that time out. Rather than
  419 +retrying, simply use a larger timeout in the first place.
  420 +
  421 +<P>The largest valid timeout value is platform-dependent and may be
  422 +significantly less than <TT>INT_MAX</TT> seconds for <TT>select()</TT>
  423 +or <TT>INT_MAX</TT> milliseconds for <TT>poll()</TT>. Generally, you
  424 +should not use timeouts exceeding several hours. Use
  425 +<tt>ST_UTIME_NO_TIMEOUT</tt> (<tt>-1</tt>) as a special value to
  426 +indicate infinite timeout or indefinite sleep. Use
  427 +<tt>ST_UTIME_NO_WAIT</tt> (<tt>0</tt>) to indicate no waiting at all.
  428 +
  429 +<P>
  430 +<HR>
  431 +<P>
  432 +</BODY>
  433 +</HTML>
  434 +
此 diff 太大无法显示。
  1 +<HTML>
  2 +<HEAD>
  3 +<TITLE>State Threads for Internet Applications</TITLE>
  4 +</HEAD>
  5 +<BODY BGCOLOR=#FFFFFF>
  6 +<H2>State Threads for Internet Applications</H2>
  7 +<H3>Introduction</H3>
  8 +<P>
  9 +State Threads is an application library which provides a
  10 +foundation for writing fast and highly scalable Internet Applications
  11 +on UNIX-like platforms. It combines the simplicity of the multithreaded
  12 +programming paradigm, in which one thread supports each simultaneous
  13 +connection, with the performance and scalability of an event-driven
  14 +state machine architecture.</P>
  15 +
  16 +<H3>1. Definitions</H3>
  17 +<P>
  18 +<A NAME="IA">
  19 +<H4>1.1 Internet Applications</H4>
  20 +</A>
  21 +<P>
  22 +An <I>Internet Application</I> (IA) is either a server or client network
  23 +application that accepts connections from clients and may or may not
  24 +connect to servers. In an IA the arrival or departure of network data
  25 +often controls processing (that is, IA is a <I>data-driven</I> application).
  26 +For each connection, an IA does some finite amount of work
  27 +involving data exchange with its peer, where its peer may be either
  28 +a client or a server.
  29 +The typical transaction steps of an IA are to accept a connection,
  30 +read a request, do some finite and predictable amount of work to
  31 +process the request, then write a response to the peer that sent the
  32 +request. One example of an IA is a Web server;
  33 +the most general example of an IA is a proxy server, because it both
  34 +accepts connections from clients and connects to other servers.</P>
  35 +<P>
  36 +We assume that the performance of an IA is constrained by available CPU
  37 +cycles rather than network bandwidth or disk I/O (that is, CPU
  38 +is a bottleneck resource).
  39 +<P>
  40 +
  41 +<A NAME="PS">
  42 +<H4>1.2 Performance and Scalability</H4>
  43 +</A>
  44 +<P>
  45 +The <I>performance</I> of an IA is usually evaluated as its
  46 +throughput measured in transactions per second or bytes per second (one
  47 +can be converted to the other, given the average transaction size). There are
  48 +several benchmarks that can be used to measure throughput of Web serving
  49 +applications for specific workloads (such as
  50 +<A HREF="http://www.spec.org/osg/web96/">SPECweb96</A>,
  51 +<A HREF="http://www.mindcraft.com/webstone/">WebStone</A>,
  52 +<A HREF="http://www.zdnet.com/zdbop/webbench/">WebBench</A>).
  53 +Although there is no common definition for <I>scalability</I>, in general it
  54 +expresses the ability of an application to sustain its performance when some
  55 +external condition changes. For IAs this external condition is either the
  56 +number of clients (also known as "users," "simultaneous connections," or "load
  57 +generators") or the underlying hardware system size (number of CPUs, memory
  58 +size, and so on). Thus there are two types of scalability: <I>load
  59 +scalability</I> and <I>system scalability</I>, respectively.
  60 +<P>
  61 +The figure below shows how the throughput of an idealized IA changes with
  62 +the increasing number of clients (solid blue line). Initially the throughput
  63 +grows linearly (the slope represents the maximal throughput that one client
  64 +can provide). Within this initial range, the IA is underutilized and CPUs are
  65 +partially idle. Further increase in the number of clients leads to a system
  66 +saturation, and the throughput gradually stops growing as all CPUs become fully
  67 +utilized. After that point, the throughput stays flat because there are no
  68 +more CPU cycles available.
  69 +In the real world, however, each simultaneous connection
  70 +consumes some computational and memory resources, even when idle, and this
  71 +overhead grows with the number of clients. Therefore, the throughput of the
  72 +real world IA starts dropping after some point (dashed blue line in the figure
  73 +below). The rate at which the throughput drops depends, among other things, on
  74 +application design.
  75 +<P>
  76 +We say that an application has a good <I>load scalability</I> if it can
  77 +sustain its throughput over a wide range of loads.
  78 +Interestingly, the <A HREF="http://www.spec.org/osg/web99/">SPECweb99</A>
  79 +benchmark somewhat reflects the Web server's load scalability because it
  80 +measures the number of clients (load generators) given a mandatory minimal
  81 +throughput per client (that is, it measures the server's <I>capacity</I>).
  82 +This is unlike <A HREF="http://www.spec.org/osg/web96/">SPECweb96</A> and
  83 +other benchmarks that use the throughput as their main metric (see the figure
  84 +below).
  85 +<P>
  86 +<CENTER><IMG SRC="fig.gif" ALT="Figure: Throughput vs. Number of clients">
  87 +</CENTER>
  88 +<P>
  89 +<I>System scalability</I> is the ability of an application to sustain its
  90 +performance per hardware unit (such as a CPU) with the increasing number of
  91 +these units. In other words, good system scalability means that doubling the
  92 +number of processors will roughly double the application's throughput (dashed
  93 +green line). We assume here that the underlying operating system also scales
  94 +well. Good system scalability allows you to initially run an application on
  95 +the smallest system possible, while retaining the ability to move that
  96 +application to a larger system if necessary, without excessive effort or
  97 +expense. That is, an application need not be rewritten or even undergo a
  98 +major porting effort when changing system size.
  99 +<P>
  100 +Although scalability and performance are more important in the case of server
  101 +IAs, they should also be considered for some client applications (such as
  102 +benchmark load generators).
  103 +<P>
  104 +
  105 +<A NAME="CONC">
  106 +<H4>1.3 Concurrency</H4>
  107 +</A>
  108 +<P>
  109 +Concurrency reflects the parallelism in a system. The two unrelated types
  110 +are <I>virtual</I> concurrency and <I>real</I> concurrency.
  111 +<UL>
  112 +<LI>Virtual (or apparent) concurrency is the number of simultaneous
  113 +connections that a system supports.
  114 +<BR><BR>
  115 +<LI>Real concurrency is the number of hardware devices, including
  116 +CPUs, network cards, and disks, that actually allow a system to perform
  117 +tasks in parallel.
  118 +</UL>
  119 +<P>
  120 +An IA must provide virtual concurrency in order to serve many users
  121 +simultaneously.
  122 +To achieve maximum performance and scalability in doing so, the number of
  123 +programming entities that an IA creates to be scheduled by the OS kernel
  124 +should be
  125 +kept close to (within an order of magnitude of) the real concurrency found on
  126 +the system. These programming entities scheduled by the kernel are known as
  127 +<I>kernel execution vehicles</I>. Examples of kernel execution vehicles
  128 +include Solaris lightweight processes and IRIX kernel threads.
  129 +In other words, the number of kernel execution vehicles should be dictated by
  130 +the system size and not by the number of simultaneous connections.
  131 +<P>
  132 +
  133 +<H3>2. Existing Architectures</H3>
  134 +<P>
  135 +There are a few different architectures that are commonly used by IAs.
  136 +These include the <I>Multi-Process</I>,
  137 +<I>Multi-Threaded</I>, and <I>Event-Driven State Machine</I>
  138 +architectures.
  139 +<P>
  140 +<A NAME="MP">
  141 +<H4>2.1 Multi-Process Architecture</H4>
  142 +</A>
  143 +<P>
  144 +In the Multi-Process (MP) architecture, an individual process is
  145 +dedicated to each simultaneous connection.
  146 +A process performs all of a transaction's initialization steps
  147 +and services a connection completely before moving on to service
  148 +a new connection.
  149 +<P>
  150 +User sessions in IAs are relatively independent; therefore, no
  151 +synchronization between processes handling different connections is
  152 +necessary. Because each process has its own private address space,
  153 +this architecture is very robust. If a process serving one of the connections
  154 +crashes, the other sessions will not be affected. However, to serve many
  155 +concurrent connections, an equal number of processes must be employed.
  156 +Because processes are kernel entities (and are in fact the heaviest ones),
  157 +the number of kernel entities will be at least as large as the number of
  158 +concurrent sessions. On most systems, good performance will not be achieved
  159 +when more than a few hundred processes are created because of the high
  160 +context-switching overhead. In other words, MP applications have poor load
  161 +scalability.
  162 +<P>
  163 +On the other hand, MP applications have very good system scalability, because
  164 +no resources are shared among different processes and there is no
  165 +synchronization overhead.
  166 +<P>
  167 +The Apache Web Server 1.x (<A HREF=#refs1>[Reference 1]</A>) uses the MP
  168 +architecture on UNIX systems.
  169 +<P>
  170 +<A NAME="MT">
  171 +<H4>2.2 Multi-Threaded Architecture</H4>
  172 +</A>
  173 +<P>
  174 +In the Multi-Threaded (MT) architecture, multiple independent threads
  175 +of control are employed within a single shared address space. Like a
  176 +process in the MP architecture, each thread performs all of a
  177 +transaction's initialization steps and services a connection completely
  178 +before moving on to service a new connection.
  179 +<P>
  180 +Many modern UNIX operating systems implement a <I>many-to-few</I> model when
  181 +mapping user-level threads to kernel entities. In this model, an
  182 +arbitrarily large number of user-level threads is multiplexed onto a
  183 +lesser number of kernel execution vehicles. Kernel execution
  184 +vehicles are also known as <I>virtual processors</I>. Whenever a user-level
  185 +thread makes a blocking system call, the kernel execution vehicle it is using
  186 +will become blocked in the kernel. If there are no other non-blocked kernel
  187 +execution vehicles and there are other runnable user-level threads, a new
  188 +kernel execution vehicle will be created automatically. This prevents the
  189 +application from blocking when it can continue to make useful forward
  190 +progress.
  191 +<P>
  192 +Because IAs are by nature network I/O driven, all concurrent sessions block on
  193 +network I/O at various points. As a result, the number of virtual processors
  194 +created in the kernel grows close to the number of user-level threads
  195 +(or simultaneous connections). When this occurs, the many-to-few model
  196 +effectively degenerates to a <I>one-to-one</I> model. Again, like in
  197 +the MP architecture, the number of kernel execution vehicles is dictated by
  198 +the number of simultaneous connections rather than by number of CPUs. This
  199 +reduces an application's load scalability. However, because kernel threads
  200 +(lightweight processes) use fewer resources and are more light-weight than
  201 +traditional UNIX processes, an MT application should scale better with load
  202 +than an MP application.
  203 +<P>
  204 +Unexpectedly, the small number of virtual processors sharing the same address
  205 +space in the MT architecture destroys an application's system scalability
  206 +because of contention among the threads on various locks. Even if an
  207 +application itself is carefully
  208 +optimized to avoid lock contention around its own global data (a non-trivial
  209 +task), there are still standard library functions and system calls
  210 +that use common resources hidden from the application. For example,
  211 +on many platforms thread safety of memory allocation routines
  212 +(<TT>malloc(3)</TT>, <TT>free(3)</TT>, and so on) is achieved by using a single
  213 +global lock. Another example is a per-process file descriptor table.
  214 +This common resource table is shared by all kernel execution vehicles within
  215 +the same process and must be protected when one modifies it via
  216 +certain system calls (such as <TT>open(2)</TT>, <TT>close(2)</TT>, and so on).
  217 +In addition to that, maintaining the caches coherent
  218 +among CPUs on multiprocessor systems hurts performance when different threads
  219 +running on different CPUs modify data items on the same cache line.
  220 +<P>
  221 +In order to improve load scalability, some applications employ a different
  222 +type of MT architecture: they create one or more thread(s) <I>per task</I>
  223 +rather than one thread <I>per connection</I>. For example, one small group
  224 +of threads may be responsible for accepting client connections, another
  225 +for request processing, and yet another for serving responses. The main
  226 +advantage of this architecture is that it eliminates the tight coupling
  227 +between the number of threads and number of simultaneous connections. However,
  228 +in this architecture, different task-specific thread groups must share common
  229 +work queues that must be protected by mutual exclusion locks (a typical
  230 +producer-consumer problem). This adds synchronization overhead that causes an
  231 +application to perform badly on multiprocessor systems. In other words, in
  232 +this architecture, the application's system scalability is sacrificed for the
  233 +sake of load scalability.
  234 +<P>
  235 +Of course, the usual nightmares of threaded programming, including data
  236 +corruption, deadlocks, and race conditions, also make MT architecture (in any
  237 +form) far from simple to use.
  238 +<P>
  239 +
  240 +<A NAME="EDSM">
  241 +<H4>2.3 Event-Driven State Machine Architecture</H4>
  242 +</A>
  243 +<P>
  244 +In the Event-Driven State Machine (EDSM) architecture, a single process
  245 +is employed to concurrently process multiple connections. The basics of this
  246 +architecture are described in Comer and Stevens
  247 +<A HREF=#refs2>[Reference 2]</A>.
  248 +The EDSM architecture performs one basic data-driven step associated with
  249 +a particular connection at a time, thus multiplexing many concurrent
  250 +connections. The process operates as a state machine that receives an event
  251 +and then reacts to it.
  252 +<P>
  253 +In the idle state the EDSM calls <TT>select(2)</TT> or <TT>poll(2)</TT> to
  254 +wait for network I/O events. When a particular file descriptor is ready for
  255 +I/O, the EDSM completes the corresponding basic step (usually by invoking a
  256 +handler function) and starts the next one. This architecture uses
  257 +non-blocking system calls to perform asynchronous network I/O operations.
  258 +For more details on non-blocking I/O see Stevens
  259 +<A HREF=#refs3>[Reference 3]</A>.
  260 +<P>
  261 +To take advantage of hardware parallelism (real concurrency), multiple
  262 +identical processes may be created. This is called Symmetric Multi-Process
  263 +EDSM and is used, for example, in the Zeus Web Server
  264 +(<A HREF=#refs4>[Reference 4]</A>). To more efficiently multiplex disk I/O,
  265 +special "helper" processes may be created. This is called Asymmetric
  266 +Multi-Process EDSM and was proposed for Web servers by Druschel
  267 +and others <A HREF=#refs5>[Reference 5]</A>.
  268 +<P>
  269 +EDSM is probably the most scalable architecture for IAs.
  270 +Because the number of simultaneous connections (virtual concurrency) is
  271 +completely decoupled from the number of kernel execution vehicles (processes),
  272 +this architecture has very good load scalability. It requires only minimal
  273 +user-level resources to create and maintain an additional connection.
  274 +<P>
  275 +Like MP applications, Multi-Process EDSM has very good system scalability
  276 +because no resources are shared among different processes and there is no
  277 +synchronization overhead.
  278 +<P>
  279 +Unfortunately, the EDSM architecture is monolithic rather than based on the
  280 +concept of threads, so new applications generally need to be implemented from
  281 +the ground up. In effect, the EDSM architecture simulates threads and their
  282 +stacks the hard way.
  283 +<P>
  284 +
  285 +<A NAME="ST">
  286 +<H3>3. State Threads Library</H3>
  287 +</A>
  288 +<P>
  289 +The State Threads library combines the advantages of all of the above
  290 +architectures. The interface preserves the programming simplicity of thread
  291 +abstraction, allowing each simultaneous connection to be treated as a separate
  292 +thread of execution within a single process. The underlying implementation is
  293 +close to the EDSM architecture as the state of each particular concurrent
  294 +session is saved in a separate memory segment.
  295 +<P>
  296 +
  297 +<H4>3.1 State Changes and Scheduling</H4>
  298 +<P>
  299 +The state of each concurrent session includes its stack environment
  300 +(stack pointer, program counter, CPU registers) and its stack. Conceptually,
  301 +a thread context switch can be viewed as a process changing its state. There
  302 +are no kernel entities involved other than processes.
  303 +Unlike other general-purpose threading libraries, the State Threads library
  304 +is fully deterministic. The thread context switch (process state change) can
  305 +only happen in a well-known set of functions (at I/O points or at explicit
  306 +synchronization points). As a result, process-specific global data does not
  307 +have to be protected by mutual exclusion locks in most cases. The entire
  308 +application is free to use all the static variables and non-reentrant library
  309 +functions it wants, greatly simplifying programming and debugging while
  310 +increasing performance. This is somewhat similar to a <I>co-routine</I> model
  311 +(co-operatively multitasked threads), except that no explicit yield is needed
  312 +--
  313 +sooner or later, a thread performs a blocking I/O operation and thus surrenders
  314 +control. All threads of execution (simultaneous connections) have the
  315 +same priority, so scheduling is non-preemptive, like in the EDSM architecture.
  316 +Because IAs are data-driven (processing is limited by the size of network
  317 +buffers and data arrival rates), scheduling is non-time-slicing.
  318 +<P>
  319 +Only two types of external events are handled by the library's
  320 +scheduler, because only these events can be detected by
  321 +<TT>select(2)</TT> or <TT>poll(2)</TT>: I/O events (a file descriptor is ready
  322 +for I/O) and time events
  323 +(some timeout has expired). However, other types of events (such as
  324 +a signal sent to a process) can also be handled by converting them to I/O
  325 +events. For example, a signal handling function can perform a write to a pipe
  326 +(<TT>write(2)</TT> is reentrant/asynchronous-safe), thus converting a signal
  327 +event to an I/O event.
  328 +<P>
  329 +To take advantage of hardware parallelism, as in the EDSM architecture,
  330 +multiple processes can be created in either a symmetric or asymmetric manner.
  331 +Process management is not in the library's scope but instead is left up to the
  332 +application.
  333 +<P>
  334 +There are several general-purpose threading libraries that implement a
  335 +<I>many-to-one</I> model (many user-level threads to one kernel execution
  336 +vehicle), using the same basic techniques as the State Threads library
  337 +(non-blocking I/O, event-driven scheduler, and so on). For an example, see GNU
  338 +Portable Threads (<A HREF=#refs6>[Reference 6]</A>). Because they are
  339 +general-purpose, these libraries have different objectives than the State
  340 +Threads library. The State Threads library is <I>not</I> a general-purpose
  341 +threading library,
  342 +but rather an application library that targets only certain types of
  343 +applications (IAs) in order to achieve the highest possible performance and
  344 +scalability for those applications.
  345 +<P>
  346 +
  347 +<H4>3.2 Scalability</H4>
  348 +<P>
  349 +State threads are very lightweight user-level entities, and therefore creating
  350 +and maintaining user connections requires minimal resources. An application
  351 +using the State Threads library scales very well with the increasing number
  352 +of connections.
  353 +<P>
  354 +On multiprocessor systems an application should create multiple processes
  355 +to take advantage of hardware parallelism. Using multiple separate processes
  356 +is the <I>only</I> way to achieve the highest possible system scalability.
  357 +This is because duplicating per-process resources is the only way to avoid
  358 +significant synchronization overhead on multiprocessor systems. Creating
  359 +separate UNIX processes naturally offers resource duplication. Again,
  360 +as in the EDSM architecture, there is no connection between the number of
  361 +simultaneous connections (which may be very large and changes within a wide
  362 +range) and the number of kernel entities (which is usually small and constant).
  363 +In other words, the State Threads library makes it possible to multiplex a
  364 +large number of simultaneous connections onto a much smaller number of
  365 +separate processes, thus allowing an application to scale well with both
  366 +the load and system size.
  367 +<P>
  368 +
  369 +<H4>3.3 Performance</H4>
  370 +<P>
  371 +Performance is one of the library's main objectives. The State Threads
  372 +library is implemented to minimize the number of system calls and
  373 +to make thread creation and context switching as fast as possible.
  374 +For example, a per-thread signal mask does not exist (unlike
  375 +POSIX threads), so there is no need to save and restore a process's
  376 +signal mask on every thread context switch. This eliminates two system
  377 +calls per context switch. Signal events can be handled much more
  378 +efficiently by converting them to I/O events (see above).
  379 +<P>
  380 +
  381 +<H4>3.4 Portability</H4>
  382 +<P>
  383 +The library uses the same general, underlying concepts as the EDSM
  384 +architecture, including non-blocking I/O, file descriptors, and
  385 +I/O multiplexing. These concepts are available in some form on most
  386 +UNIX platforms, making the library very portable across many
  387 +flavors of UNIX. There are only a few platform-dependent sections in the
  388 +source.
  389 +<P>
  390 +
  391 +<H4>3.5 State Threads and NSPR</H4>
  392 +<P>
  393 +The State Threads library is a derivative of the Netscape Portable
  394 +Runtime library (NSPR) <A HREF=#refs7>[Reference 7]</A>. The primary goal of
  395 +NSPR is to provide a platform-independent layer for system facilities,
  396 +where system facilities include threads, thread synchronization, and I/O.
  397 +Performance and scalability are not the main concern of NSPR. The
  398 +State Threads library addresses performance and scalability while
  399 +remaining much smaller than NSPR. It is contained in 8 source files
  400 +as opposed to more than 400, but provides all the functionality that
  401 +is needed to write efficient IAs on UNIX-like platforms.
  402 +<P>
  403 +
  404 +<TABLE CELLPADDING=3>
  405 +<TR>
  406 +<TD></TD>
  407 +<TH>NSPR</TH>
  408 +<TH>State Threads</TH>
  409 +</TR>
  410 +<TR>
  411 +<TD><B>Lines of code</B></TD>
  412 +<TD ALIGN=RIGHT>~150,000</TD>
  413 +<TD ALIGN=RIGHT>~3000</TD>
  414 +</TR>
  415 +<TR>
  416 +<TD><B>Dynamic library size&nbsp;&nbsp;<BR>(debug version)</B></TD>
  417 +<TD></TD>
  418 +<TD></TD>
  419 +</TR>
  420 +<TR>
  421 +<TD>IRIX</TD>
  422 +<TD ALIGN=RIGHT>~700 KB</TD>
  423 +<TD ALIGN=RIGHT>~60 KB</TD>
  424 +</TR>
  425 +<TR>
  426 +<TD>Linux</TD>
  427 +<TD ALIGN=RIGHT>~900 KB</TD>
  428 +<TD ALIGN=RIGHT>~70 KB</TD>
  429 +</TR>
  430 +</TABLE>
  431 +<P>
  432 +
  433 +<H3>Conclusion</H3>
  434 +<P>
  435 +State Threads is an application library which provides a foundation for
  436 +writing <A HREF=#IA>Internet Applications</A>. To summarize, it has the
  437 +following <I>advantages</I>:
  438 +<P>
  439 +<UL>
  440 +<LI>It allows the design of fast and highly scalable applications. An
  441 +application will scale well with both load and number of CPUs.
  442 +<P>
  443 +<LI>It greatly simplifies application programming and debugging because, as a
  444 +rule, no mutual exclusion locking is necessary and the entire application is
  445 +free to use static variables and non-reentrant library functions.
  446 +</UL>
  447 +<P>
  448 +The library's main <I>limitation</I>:
  449 +<P>
  450 +<UL>
  451 +<LI>All I/O operations on sockets must use the State Threads library's I/O
  452 +functions because only those functions perform thread scheduling and prevent
  453 +the application's processes from blocking.
  454 +</UL>
  455 +<P>
  456 +
  457 +<H3>References</H3>
  458 +<OL>
  459 +<A NAME="refs1">
  460 +<LI> Apache Software Foundation,
  461 +<A HREF="http://www.apache.org">http://www.apache.org</A>.
  462 +<A NAME="refs2">
  463 +<LI> Douglas E. Comer, David L. Stevens, <I>Internetworking With TCP/IP,
  464 +Vol. III: Client-Server Programming And Applications</I>, Second Edition,
  465 +Ch. 8, 12.
  466 +<A NAME="refs3">
  467 +<LI> W. Richard Stevens, <I>UNIX Network Programming</I>, Second Edition,
  468 +Vol. 1, Ch. 15.
  469 +<A NAME="refs4">
  470 +<LI> Zeus Technology Limited,
  471 +<A HREF="http://www.zeus.co.uk/">http://www.zeus.co.uk</A>.
  472 +<A NAME="refs5">
  473 +<LI> Peter Druschel, Vivek S. Pai, Willy Zwaenepoel,
  474 +<A HREF="http://www.cs.rice.edu/~druschel/usenix99flash.ps.gz">
  475 +Flash: An Efficient and Portable Web Server</A>. In <I>Proceedings of the
  476 +USENIX 1999 Annual Technical Conference</I>, Monterey, CA, June 1999.
  477 +<A NAME="refs6">
  478 +<LI> GNU Portable Threads,
  479 +<A HREF="http://www.gnu.org/software/pth/">http://www.gnu.org/software/pth/</A>.
  480 +<A NAME="refs7">
  481 +<LI> Netscape Portable Runtime,
  482 +<A HREF="http://www.mozilla.org/docs/refList/refNSPR/">http://www.mozilla.org/docs/refList/refNSPR/</A>.
  483 +</OL>
  484 +
  485 +<H3>Other resources covering various architectural issues in IAs</H3>
  486 +<OL START=8>
  487 +<LI> Dan Kegel, <I>The C10K problem</I>,
  488 +<A HREF="http://www.kegel.com/c10k.html">http://www.kegel.com/c10k.html</A>.
  489 +</LI>
  490 +<LI> James C. Hu, Douglas C. Schmidt, Irfan Pyarali, <I>JAWS: Understanding
  491 +High Performance Web Systems</I>,
  492 +<A HREF="http://www.cs.wustl.edu/~jxh/research/research.html">http://www.cs.wustl.edu/~jxh/research/research.html</A>.</LI>
  493 +</OL>
  494 +<P>
  495 +<HR>
  496 +<P>
  497 +
  498 +<CENTER><FONT SIZE=-1>Portions created by SGI are Copyright &copy; 2000
  499 +Silicon Graphics, Inc. All rights reserved.</FONT></CENTER>
  500 +<P>
  501 +
  502 +</BODY>
  503 +</HTML>
  504 +
  1 +How the timeout heap works
  2 +
  3 +As of version 1.5, the State Threads Library represents the queue of
  4 +sleeping threads using a heap data structure rather than a sorted
  5 +linked list. This improves performance when there is a large number
  6 +of sleeping threads, since insertion into a heap takes O(log N) time
  7 +while insertion into a sorted list takes O(N) time. For example, in
  8 +one test 1000 threads were created, each thread called st_usleep()
  9 +with a random time interval, and then all the threads were
  10 +immediately interrupted and joined before the sleeps had a chance to
  11 +finish. The whole process was repeated 1000 times, for a total of a
  12 +million sleep queue insertions and removals. With the old list-based
  13 +sleep queue, this test took 100 seconds; now it takes only 12 seconds.
  14 +
  15 +Heap data structures are typically based on dynamically resized
  16 +arrays. However, since the existing ST code base was very nicely
  17 +structured around linking the thread objects into pointer-based lists
  18 +without the need for any auxiliary data structures, implementing the
  19 +heap using a similar nodes-and-pointers based approach seemed more
  20 +appropriate for ST than introducing a separate array.
  21 +
  22 +Thus, the new ST timeout heap works by organizing the existing
  23 +_st_thread_t objects in a balanced binary tree, just as they were
  24 +previously organized into a doubly-linked, sorted list. The global
  25 +_ST_SLEEPQ variable, formerly a linked list head, is now simply a
  26 +pointer to the root of this tree, and the root node of the tree is the
  27 +thread with the earliest timeout. Each thread object has two child
  28 +pointers, "left" and "right", pointing to threads with later timeouts.
  29 +
  30 +Each node in the tree is numbered with an integer index, corresponding
  31 +to the array index in an array-based heap, and the tree is kept fully
  32 +balanced and left-adjusted at all times. In other words, the tree
  33 +consists of any number of fully populated top levels, followed by a
  34 +single bottom level which may be partially populated, such that any
  35 +existing nodes form a contiguous block to the left and the spaces for
  36 +missing nodes form a contiguous block to the right. For example, if
  37 +there are nine threads waiting for a timeout, they are numbered and
  38 +arranged in a tree exactly as follows:
  39 +
  40 +            1
  41 +           / \
  42 +          2   3
  43 +         / \ / \
  44 +        4  5 6  7
  45 +       / \
  46 +      8   9
  47 +
  48 +Each node has either no children, only a left child, or both a left
  49 +and a right child. Children always time out later than their parents
  50 +(this is called the "heap invariant"), but when a node has two
  51 +children, their mutual order is unspecified - the left child may time
  52 +out before or after the right child. If a node is numbered N, its
  53 +left child is numbered 2N, and its right child is numbered 2N+1.
  54 +
  55 +There is no pointer from a child to its parent; all pointers point
  56 +downward. Additions and deletions both work by starting at the root
  57 +and traversing the tree towards the leaves, going left or right
  58 +according to the binary digits forming the index of the destination
  59 +node. As nodes are added or deleted, existing nodes are rearranged to
  60 +maintain the heap invariant.
  1 +/*
  2 + * The contents of this file are subject to the Mozilla Public
  3 + * License Version 1.1 (the "License"); you may not use this file
  4 + * except in compliance with the License. You may obtain a copy of
  5 + * the License at http://www.mozilla.org/MPL/
  6 + *
  7 + * Software distributed under the License is distributed on an "AS
  8 + * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
  9 + * implied. See the License for the specific language governing
  10 + * rights and limitations under the License.
  11 + *
  12 + * The Original Code is the Netscape Portable Runtime library.
  13 + *
  14 + * The Initial Developer of the Original Code is Netscape
  15 + * Communications Corporation. Portions created by Netscape are
  16 + * Copyright (C) 1994-2000 Netscape Communications Corporation. All
  17 + * Rights Reserved.
  18 + *
  19 + * Contributor(s): Silicon Graphics, Inc.
  20 + * Yahoo! Inc.
  21 + *
  22 + * Alternatively, the contents of this file may be used under the
  23 + * terms of the GNU General Public License Version 2 or later (the
  24 + * "GPL"), in which case the provisions of the GPL are applicable
  25 + * instead of those above. If you wish to allow use of your
  26 + * version of this file only under the terms of the GPL and not to
  27 + * allow others to use your version of this file under the MPL,
  28 + * indicate your decision by deleting the provisions above and
  29 + * replace them with the notice and other provisions required by
  30 + * the GPL. If you do not delete the provisions above, a recipient
  31 + * may use your version of this file under either the MPL or the
  32 + * GPL.
  33 + */
  34 +
  35 +#include <stdlib.h>
  36 +#include <unistd.h>
  37 +#include <fcntl.h>
  38 +#include <string.h>
  39 +#include <time.h>
  40 +#include <errno.h>
  41 +#include "common.h"
  42 +
  43 +#ifdef MD_HAVE_KQUEUE
  44 +#include <sys/event.h>
  45 +#endif
  46 +#ifdef MD_HAVE_EPOLL
  47 +#include <sys/epoll.h>
  48 +#endif
  49 +
  50 +#if defined(USE_POLL) && !defined(MD_HAVE_POLL)
  51 +/* Force poll usage if explicitly asked for it */
  52 +#define MD_HAVE_POLL
  53 +#endif
  54 +
  55 +
/*
 * Bookkeeping for the select() event system.  A single instance is
 * allocated by _st_select_init() and reached through _st_select_data.
 */
static struct _st_seldata {
    fd_set fd_read_set;                /* descriptors with at least one POLLIN waiter */
    fd_set fd_write_set;               /* descriptors with at least one POLLOUT waiter */
    fd_set fd_exception_set;           /* descriptors with at least one POLLPRI waiter */
    int    fd_ref_cnts[FD_SETSIZE][3]; /* per-descriptor waiter counts:
                                          [0] read, [1] write, [2] exception */
    int    maxfd;                      /* highest descriptor handed to select(), -1 if none */
} *_st_select_data;

/* Shorthand accessors for the members of *_st_select_data */
#define _ST_SELECT_MAX_OSFD      (_st_select_data->maxfd)
#define _ST_SELECT_READ_SET      (_st_select_data->fd_read_set)
#define _ST_SELECT_WRITE_SET     (_st_select_data->fd_write_set)
#define _ST_SELECT_EXCEP_SET     (_st_select_data->fd_exception_set)
#define _ST_SELECT_READ_CNT(fd)  (_st_select_data->fd_ref_cnts[fd][0])
#define _ST_SELECT_WRITE_CNT(fd) (_st_select_data->fd_ref_cnts[fd][1])
#define _ST_SELECT_EXCEP_CNT(fd) (_st_select_data->fd_ref_cnts[fd][2])
  69 +
  70 +
  71 +#ifdef MD_HAVE_POLL
/*
 * Bookkeeping for the poll() event system.  A single instance is
 * allocated by _st_poll_init() and reached through _st_poll_data.
 */
static struct _st_polldata {
    struct pollfd *pollfds;      /* scratch array handed to poll() */
    int            pollfds_size; /* number of entries allocated in pollfds */
    int            fdcnt;        /* total number of pollfd entries currently registered */
} *_st_poll_data;

/* Shorthand accessors for the members of *_st_poll_data */
#define _ST_POLL_OSFD_CNT (_st_poll_data->fdcnt)
#define _ST_POLLFDS       (_st_poll_data->pollfds)
#define _ST_POLLFDS_SIZE  (_st_poll_data->pollfds_size)
  81 +#endif /* MD_HAVE_POLL */
  82 +
  83 +
  84 +#ifdef MD_HAVE_KQUEUE
/* Per-descriptor state kept by the kqueue event system */
typedef struct _kq_fd_data {
    int rd_ref_cnt; /* number of waiters interested in POLLIN */
    int wr_ref_cnt; /* number of waiters interested in POLLOUT */
    int revents;    /* POLL* bits collected for this descriptor during a dispatch */
} _kq_fd_data_t;

/*
 * Bookkeeping for the kqueue event system.  A single instance is
 * allocated by _st_kq_init() and reached through _st_kq_data.
 */
static struct _st_kqdata {
    _kq_fd_data_t *fd_data;      /* array indexed by descriptor number */
    struct kevent *evtlist;      /* receives the events returned by kevent() */
    struct kevent *addlist;      /* changes queued for the next kevent() call */
    struct kevent *dellist;      /* EV_DELETE changes submitted synchronously */
    int            fd_data_size; /* entries allocated in fd_data */
    int            evtlist_size; /* entries allocated in evtlist */
    int            addlist_size; /* entries allocated in addlist */
    int            addlist_cnt;  /* entries currently used in addlist */
    int            dellist_size; /* entries allocated in dellist */
    int            dellist_cnt;  /* entries currently used in dellist */
    int            kq;           /* descriptor returned by kqueue() */
    pid_t          pid;          /* pid of the process that created kq */
} *_st_kq_data;

/* Initial number of entries in each of the event/add/delete lists */
#ifndef ST_KQ_MIN_EVTLIST_SIZE
#define ST_KQ_MIN_EVTLIST_SIZE 64
#endif

/* Shorthand accessors for the per-descriptor state */
#define _ST_KQ_READ_CNT(fd)  (_st_kq_data->fd_data[fd].rd_ref_cnt)
#define _ST_KQ_WRITE_CNT(fd) (_st_kq_data->fd_data[fd].wr_ref_cnt)
#define _ST_KQ_REVENTS(fd)   (_st_kq_data->fd_data[fd].revents)
  113 +#endif /* MD_HAVE_KQUEUE */
  114 +
  115 +
  116 +#ifdef MD_HAVE_EPOLL
/* Per-descriptor state kept by the epoll event system */
typedef struct _epoll_fd_data {
    int rd_ref_cnt; /* number of waiters interested in POLLIN */
    int wr_ref_cnt; /* number of waiters interested in POLLOUT */
    int ex_ref_cnt; /* number of waiters interested in POLLPRI */
    int revents;    /* event bits recorded for this descriptor */
} _epoll_fd_data_t;

/*
 * Bookkeeping for the epoll event system.  A single instance is
 * allocated by _st_epoll_init() and reached through _st_epoll_data.
 */
static struct _st_epolldata {
    _epoll_fd_data_t   *fd_data;      /* array indexed by descriptor number */
    struct epoll_event *evtlist;      /* buffer for returned events */
    int                 fd_data_size; /* entries allocated in fd_data */
    int                 evtlist_size; /* entries allocated in evtlist */
    int                 evtlist_cnt;  /* entries wanted in evtlist (drives its growth) */
    int                 fd_hint;      /* size hint given to epoll_create() */
    int                 epfd;         /* descriptor returned by epoll_create() */
    pid_t               pid;          /* pid of the process that created epfd */
} *_st_epoll_data;

#ifndef ST_EPOLL_EVTLIST_SIZE
/* Not a limit, just a hint */
#define ST_EPOLL_EVTLIST_SIZE 4096
#endif

/* Shorthand accessors for the per-descriptor state */
#define _ST_EPOLL_READ_CNT(fd)  (_st_epoll_data->fd_data[fd].rd_ref_cnt)
#define _ST_EPOLL_WRITE_CNT(fd) (_st_epoll_data->fd_data[fd].wr_ref_cnt)
#define _ST_EPOLL_EXCEP_CNT(fd) (_st_epoll_data->fd_data[fd].ex_ref_cnt)
#define _ST_EPOLL_REVENTS(fd)   (_st_epoll_data->fd_data[fd].revents)

/* Each *_BIT macro yields its EPOLL* flag while the matching count is non-zero */
#define _ST_EPOLL_READ_BIT(fd)  (_ST_EPOLL_READ_CNT(fd) ? EPOLLIN : 0)
#define _ST_EPOLL_WRITE_BIT(fd) (_ST_EPOLL_WRITE_CNT(fd) ? EPOLLOUT : 0)
#define _ST_EPOLL_EXCEP_BIT(fd) (_ST_EPOLL_EXCEP_CNT(fd) ? EPOLLPRI : 0)

/* Event mask that reflects the current reference counts of a descriptor */
#define _ST_EPOLL_EVENTS(fd) \
    (_ST_EPOLL_READ_BIT(fd)|_ST_EPOLL_WRITE_BIT(fd)|_ST_EPOLL_EXCEP_BIT(fd))
  150 +
  151 +#endif /* MD_HAVE_EPOLL */
  152 +
/*
 * Pointer to an event system descriptor; starts out as NULL.
 * NOTE(review): presumably set to one of the static *_eventsys tables
 * in this file by selection code outside this view -- confirm.
 */
_st_eventsys_t *_st_eventsys = NULL;
  154 +
  155 +
  156 +/*****************************************
  157 + * select event system
  158 + */
  159 +
  160 +ST_HIDDEN int _st_select_init(void)
  161 +{
  162 + _st_select_data = (struct _st_seldata *) malloc(sizeof(*_st_select_data));
  163 + if (!_st_select_data)
  164 + return -1;
  165 +
  166 + memset(_st_select_data, 0, sizeof(*_st_select_data));
  167 + _st_select_data->maxfd = -1;
  168 +
  169 + return 0;
  170 +}
  171 +
  172 +ST_HIDDEN int _st_select_pollset_add(struct pollfd *pds, int npds)
  173 +{
  174 + struct pollfd *pd;
  175 + struct pollfd *epd = pds + npds;
  176 +
  177 + /* Do checks up front */
  178 + for (pd = pds; pd < epd; pd++) {
  179 + if (pd->fd < 0 || pd->fd >= FD_SETSIZE || !pd->events ||
  180 + (pd->events & ~(POLLIN | POLLOUT | POLLPRI))) {
  181 + errno = EINVAL;
  182 + return -1;
  183 + }
  184 + }
  185 +
  186 + for (pd = pds; pd < epd; pd++) {
  187 + if (pd->events & POLLIN) {
  188 + FD_SET(pd->fd, &_ST_SELECT_READ_SET);
  189 + _ST_SELECT_READ_CNT(pd->fd)++;
  190 + }
  191 + if (pd->events & POLLOUT) {
  192 + FD_SET(pd->fd, &_ST_SELECT_WRITE_SET);
  193 + _ST_SELECT_WRITE_CNT(pd->fd)++;
  194 + }
  195 + if (pd->events & POLLPRI) {
  196 + FD_SET(pd->fd, &_ST_SELECT_EXCEP_SET);
  197 + _ST_SELECT_EXCEP_CNT(pd->fd)++;
  198 + }
  199 + if (_ST_SELECT_MAX_OSFD < pd->fd)
  200 + _ST_SELECT_MAX_OSFD = pd->fd;
  201 + }
  202 +
  203 + return 0;
  204 +}
  205 +
  206 +ST_HIDDEN void _st_select_pollset_del(struct pollfd *pds, int npds)
  207 +{
  208 + struct pollfd *pd;
  209 + struct pollfd *epd = pds + npds;
  210 +
  211 + for (pd = pds; pd < epd; pd++) {
  212 + if (pd->events & POLLIN) {
  213 + if (--_ST_SELECT_READ_CNT(pd->fd) == 0)
  214 + FD_CLR(pd->fd, &_ST_SELECT_READ_SET);
  215 + }
  216 + if (pd->events & POLLOUT) {
  217 + if (--_ST_SELECT_WRITE_CNT(pd->fd) == 0)
  218 + FD_CLR(pd->fd, &_ST_SELECT_WRITE_SET);
  219 + }
  220 + if (pd->events & POLLPRI) {
  221 + if (--_ST_SELECT_EXCEP_CNT(pd->fd) == 0)
  222 + FD_CLR(pd->fd, &_ST_SELECT_EXCEP_SET);
  223 + }
  224 + }
  225 +}
  226 +
  227 +ST_HIDDEN void _st_select_find_bad_fd(void)
  228 +{
  229 + _st_clist_t *q;
  230 + _st_pollq_t *pq;
  231 + int notify;
  232 + struct pollfd *pds, *epds;
  233 + int pq_max_osfd, osfd;
  234 + short events;
  235 +
  236 + _ST_SELECT_MAX_OSFD = -1;
  237 +
  238 + for (q = _ST_IOQ.next; q != &_ST_IOQ; q = q->next) {
  239 + pq = _ST_POLLQUEUE_PTR(q);
  240 + notify = 0;
  241 + epds = pq->pds + pq->npds;
  242 + pq_max_osfd = -1;
  243 +
  244 + for (pds = pq->pds; pds < epds; pds++) {
  245 + osfd = pds->fd;
  246 + pds->revents = 0;
  247 + if (pds->events == 0)
  248 + continue;
  249 + if (fcntl(osfd, F_GETFL, 0) < 0) {
  250 + pds->revents = POLLNVAL;
  251 + notify = 1;
  252 + }
  253 + if (osfd > pq_max_osfd) {
  254 + pq_max_osfd = osfd;
  255 + }
  256 + }
  257 +
  258 + if (notify) {
  259 + ST_REMOVE_LINK(&pq->links);
  260 + pq->on_ioq = 0;
  261 + /*
  262 + * Decrement the count of descriptors for each descriptor/event
  263 + * because this I/O request is being removed from the ioq
  264 + */
  265 + for (pds = pq->pds; pds < epds; pds++) {
  266 + osfd = pds->fd;
  267 + events = pds->events;
  268 + if (events & POLLIN) {
  269 + if (--_ST_SELECT_READ_CNT(osfd) == 0) {
  270 + FD_CLR(osfd, &_ST_SELECT_READ_SET);
  271 + }
  272 + }
  273 + if (events & POLLOUT) {
  274 + if (--_ST_SELECT_WRITE_CNT(osfd) == 0) {
  275 + FD_CLR(osfd, &_ST_SELECT_WRITE_SET);
  276 + }
  277 + }
  278 + if (events & POLLPRI) {
  279 + if (--_ST_SELECT_EXCEP_CNT(osfd) == 0) {
  280 + FD_CLR(osfd, &_ST_SELECT_EXCEP_SET);
  281 + }
  282 + }
  283 + }
  284 +
  285 + if (pq->thread->flags & _ST_FL_ON_SLEEPQ)
  286 + _ST_DEL_SLEEPQ(pq->thread);
  287 + pq->thread->state = _ST_ST_RUNNABLE;
  288 + _ST_ADD_RUNQ(pq->thread);
  289 + } else {
  290 + if (_ST_SELECT_MAX_OSFD < pq_max_osfd)
  291 + _ST_SELECT_MAX_OSFD = pq_max_osfd;
  292 + }
  293 + }
  294 +}
  295 +
  296 +ST_HIDDEN void _st_select_dispatch(void)
  297 +{
  298 + struct timeval timeout, *tvp;
  299 + fd_set r, w, e;
  300 + fd_set *rp, *wp, *ep;
  301 + int nfd, pq_max_osfd, osfd;
  302 + _st_clist_t *q;
  303 + st_utime_t min_timeout;
  304 + _st_pollq_t *pq;
  305 + int notify;
  306 + struct pollfd *pds, *epds;
  307 + short events, revents;
  308 +
  309 + /*
  310 + * Assignment of fd_sets
  311 + */
  312 + r = _ST_SELECT_READ_SET;
  313 + w = _ST_SELECT_WRITE_SET;
  314 + e = _ST_SELECT_EXCEP_SET;
  315 +
  316 + rp = &r;
  317 + wp = &w;
  318 + ep = &e;
  319 +
  320 + if (_ST_SLEEPQ == NULL) {
  321 + tvp = NULL;
  322 + } else {
  323 + min_timeout = (_ST_SLEEPQ->due <= _ST_LAST_CLOCK) ? 0 :
  324 + (_ST_SLEEPQ->due - _ST_LAST_CLOCK);
  325 + timeout.tv_sec = (int) (min_timeout / 1000000);
  326 + timeout.tv_usec = (int) (min_timeout % 1000000);
  327 + tvp = &timeout;
  328 + }
  329 +
  330 + /* Check for I/O operations */
  331 + nfd = select(_ST_SELECT_MAX_OSFD + 1, rp, wp, ep, tvp);
  332 +
  333 + /* Notify threads that are associated with the selected descriptors */
  334 + if (nfd > 0) {
  335 + _ST_SELECT_MAX_OSFD = -1;
  336 + for (q = _ST_IOQ.next; q != &_ST_IOQ; q = q->next) {
  337 + pq = _ST_POLLQUEUE_PTR(q);
  338 + notify = 0;
  339 + epds = pq->pds + pq->npds;
  340 + pq_max_osfd = -1;
  341 +
  342 + for (pds = pq->pds; pds < epds; pds++) {
  343 + osfd = pds->fd;
  344 + events = pds->events;
  345 + revents = 0;
  346 + if ((events & POLLIN) && FD_ISSET(osfd, rp)) {
  347 + revents |= POLLIN;
  348 + }
  349 + if ((events & POLLOUT) && FD_ISSET(osfd, wp)) {
  350 + revents |= POLLOUT;
  351 + }
  352 + if ((events & POLLPRI) && FD_ISSET(osfd, ep)) {
  353 + revents |= POLLPRI;
  354 + }
  355 + pds->revents = revents;
  356 + if (revents) {
  357 + notify = 1;
  358 + }
  359 + if (osfd > pq_max_osfd) {
  360 + pq_max_osfd = osfd;
  361 + }
  362 + }
  363 + if (notify) {
  364 + ST_REMOVE_LINK(&pq->links);
  365 + pq->on_ioq = 0;
  366 + /*
  367 + * Decrement the count of descriptors for each descriptor/event
  368 + * because this I/O request is being removed from the ioq
  369 + */
  370 + for (pds = pq->pds; pds < epds; pds++) {
  371 + osfd = pds->fd;
  372 + events = pds->events;
  373 + if (events & POLLIN) {
  374 + if (--_ST_SELECT_READ_CNT(osfd) == 0) {
  375 + FD_CLR(osfd, &_ST_SELECT_READ_SET);
  376 + }
  377 + }
  378 + if (events & POLLOUT) {
  379 + if (--_ST_SELECT_WRITE_CNT(osfd) == 0) {
  380 + FD_CLR(osfd, &_ST_SELECT_WRITE_SET);
  381 + }
  382 + }
  383 + if (events & POLLPRI) {
  384 + if (--_ST_SELECT_EXCEP_CNT(osfd) == 0) {
  385 + FD_CLR(osfd, &_ST_SELECT_EXCEP_SET);
  386 + }
  387 + }
  388 + }
  389 +
  390 + if (pq->thread->flags & _ST_FL_ON_SLEEPQ)
  391 + _ST_DEL_SLEEPQ(pq->thread);
  392 + pq->thread->state = _ST_ST_RUNNABLE;
  393 + _ST_ADD_RUNQ(pq->thread);
  394 + } else {
  395 + if (_ST_SELECT_MAX_OSFD < pq_max_osfd)
  396 + _ST_SELECT_MAX_OSFD = pq_max_osfd;
  397 + }
  398 + }
  399 + } else if (nfd < 0) {
  400 + /*
  401 + * It can happen when a thread closes file descriptor
  402 + * that is being used by some other thread -- BAD!
  403 + */
  404 + if (errno == EBADF)
  405 + _st_select_find_bad_fd();
  406 + }
  407 +}
  408 +
  409 +ST_HIDDEN int _st_select_fd_new(int osfd)
  410 +{
  411 + if (osfd >= FD_SETSIZE) {
  412 + errno = EMFILE;
  413 + return -1;
  414 + }
  415 +
  416 + return 0;
  417 +}
  418 +
  419 +ST_HIDDEN int _st_select_fd_close(int osfd)
  420 +{
  421 + if (_ST_SELECT_READ_CNT(osfd) || _ST_SELECT_WRITE_CNT(osfd) ||
  422 + _ST_SELECT_EXCEP_CNT(osfd)) {
  423 + errno = EBUSY;
  424 + return -1;
  425 + }
  426 +
  427 + return 0;
  428 +}
  429 +
/*
 * Report the descriptor limit of the select() event system:
 * FD_SETSIZE, the same bound that _st_select_fd_new() enforces.
 */
ST_HIDDEN int _st_select_fd_getlimit(void)
{
    return FD_SETSIZE;
}
  434 +
/*
 * Descriptor of the select() event system.  The initializer is
 * positional, so the entries must stay in the order in which the
 * members of _st_eventsys_t are declared (declared outside this file).
 */
static _st_eventsys_t _st_select_eventsys = {
    "select",               /* name */
    ST_EVENTSYS_SELECT,     /* event system identifier */
    _st_select_init,        /* allocate bookkeeping */
    _st_select_dispatch,    /* wait for I/O and wake threads */
    _st_select_pollset_add, /* register a poll set */
    _st_select_pollset_del, /* unregister a poll set */
    _st_select_fd_new,      /* validate a new descriptor */
    _st_select_fd_close,    /* check a descriptor may be closed */
    _st_select_fd_getlimit  /* report the descriptor limit */
};
  446 +
  447 +
  448 +#ifdef MD_HAVE_POLL
  449 +/*****************************************
  450 + * poll event system
  451 + */
  452 +
  453 +ST_HIDDEN int _st_poll_init(void)
  454 +{
  455 + _st_poll_data = (struct _st_polldata *) malloc(sizeof(*_st_poll_data));
  456 + if (!_st_poll_data)
  457 + return -1;
  458 +
  459 + _ST_POLLFDS = (struct pollfd *) malloc(ST_MIN_POLLFDS_SIZE *
  460 + sizeof(struct pollfd));
  461 + if (!_ST_POLLFDS) {
  462 + free(_st_poll_data);
  463 + _st_poll_data = NULL;
  464 + return -1;
  465 + }
  466 + _ST_POLLFDS_SIZE = ST_MIN_POLLFDS_SIZE;
  467 + _ST_POLL_OSFD_CNT = 0;
  468 +
  469 + return 0;
  470 +}
  471 +
  472 +ST_HIDDEN int _st_poll_pollset_add(struct pollfd *pds, int npds)
  473 +{
  474 + struct pollfd *pd;
  475 + struct pollfd *epd = pds + npds;
  476 +
  477 + for (pd = pds; pd < epd; pd++) {
  478 + if (pd->fd < 0 || !pd->events) {
  479 + errno = EINVAL;
  480 + return -1;
  481 + }
  482 + }
  483 +
  484 + _ST_POLL_OSFD_CNT += npds;
  485 +
  486 + return 0;
  487 +}
  488 +
  489 +/* ARGSUSED */
  490 +ST_HIDDEN void _st_poll_pollset_del(struct pollfd *pds, int npds)
  491 +{
  492 + _ST_POLL_OSFD_CNT -= npds;
  493 + ST_ASSERT(_ST_POLL_OSFD_CNT >= 0);
  494 +}
  495 +
  496 +ST_HIDDEN void _st_poll_dispatch(void)
  497 +{
  498 + int timeout, nfd;
  499 + _st_clist_t *q;
  500 + st_utime_t min_timeout;
  501 + _st_pollq_t *pq;
  502 + struct pollfd *pds, *epds, *pollfds;
  503 +
  504 + /*
  505 + * Build up the array of struct pollfd to wait on.
  506 + * If existing array is not big enough, release it and allocate a new one.
  507 + */
  508 + ST_ASSERT(_ST_POLL_OSFD_CNT >= 0);
  509 + if (_ST_POLL_OSFD_CNT > _ST_POLLFDS_SIZE) {
  510 + free(_ST_POLLFDS);
  511 + _ST_POLLFDS = (struct pollfd *) malloc((_ST_POLL_OSFD_CNT + 10) *
  512 + sizeof(struct pollfd));
  513 + ST_ASSERT(_ST_POLLFDS != NULL);
  514 + _ST_POLLFDS_SIZE = _ST_POLL_OSFD_CNT + 10;
  515 + }
  516 + pollfds = _ST_POLLFDS;
  517 +
  518 + /* Gather all descriptors into one array */
  519 + for (q = _ST_IOQ.next; q != &_ST_IOQ; q = q->next) {
  520 + pq = _ST_POLLQUEUE_PTR(q);
  521 + memcpy(pollfds, pq->pds, sizeof(struct pollfd) * pq->npds);
  522 + pollfds += pq->npds;
  523 + }
  524 + ST_ASSERT(pollfds <= _ST_POLLFDS + _ST_POLLFDS_SIZE);
  525 +
  526 + if (_ST_SLEEPQ == NULL) {
  527 + timeout = -1;
  528 + } else {
  529 + min_timeout = (_ST_SLEEPQ->due <= _ST_LAST_CLOCK) ? 0 :
  530 + (_ST_SLEEPQ->due - _ST_LAST_CLOCK);
  531 + timeout = (int) (min_timeout / 1000);
  532 + }
  533 +
  534 + /* Check for I/O operations */
  535 + nfd = poll(_ST_POLLFDS, _ST_POLL_OSFD_CNT, timeout);
  536 +
  537 + /* Notify threads that are associated with the selected descriptors */
  538 + if (nfd > 0) {
  539 + pollfds = _ST_POLLFDS;
  540 + for (q = _ST_IOQ.next; q != &_ST_IOQ; q = q->next) {
  541 + pq = _ST_POLLQUEUE_PTR(q);
  542 + epds = pollfds + pq->npds;
  543 + for (pds = pollfds; pds < epds; pds++) {
  544 + if (pds->revents)
  545 + break;
  546 + }
  547 + if (pds < epds) {
  548 + memcpy(pq->pds, pollfds, sizeof(struct pollfd) * pq->npds);
  549 + ST_REMOVE_LINK(&pq->links);
  550 + pq->on_ioq = 0;
  551 +
  552 + if (pq->thread->flags & _ST_FL_ON_SLEEPQ)
  553 + _ST_DEL_SLEEPQ(pq->thread);
  554 + pq->thread->state = _ST_ST_RUNNABLE;
  555 + _ST_ADD_RUNQ(pq->thread);
  556 +
  557 + _ST_POLL_OSFD_CNT -= pq->npds;
  558 + ST_ASSERT(_ST_POLL_OSFD_CNT >= 0);
  559 + }
  560 + pollfds = epds;
  561 + }
  562 + }
  563 +}
  564 +
  565 +/* ARGSUSED */
  566 +ST_HIDDEN int _st_poll_fd_new(int osfd)
  567 +{
  568 + return 0;
  569 +}
  570 +
  571 +/* ARGSUSED */
  572 +ST_HIDDEN int _st_poll_fd_close(int osfd)
  573 +{
  574 + /*
  575 + * We don't maintain I/O counts for poll event system
  576 + * so nothing to check here.
  577 + */
  578 + return 0;
  579 +}
  580 +
/*
 * Report the descriptor limit of the poll() event system.
 */
ST_HIDDEN int _st_poll_fd_getlimit(void)
{
    /* zero means no specific limit */
    return 0;
}
  586 +
/*
 * Descriptor of the poll() event system.  The initializer is
 * positional, so the entries must stay in the order in which the
 * members of _st_eventsys_t are declared (declared outside this file).
 */
static _st_eventsys_t _st_poll_eventsys = {
    "poll",               /* name */
    ST_EVENTSYS_POLL,     /* event system identifier */
    _st_poll_init,        /* allocate bookkeeping */
    _st_poll_dispatch,    /* wait for I/O and wake threads */
    _st_poll_pollset_add, /* register a poll set */
    _st_poll_pollset_del, /* unregister a poll set */
    _st_poll_fd_new,      /* validate a new descriptor */
    _st_poll_fd_close,    /* check a descriptor may be closed */
    _st_poll_fd_getlimit  /* report the descriptor limit */
};
  598 +#endif /* MD_HAVE_POLL */
  599 +
  600 +
  601 +#ifdef MD_HAVE_KQUEUE
  602 +/*****************************************
  603 + * kqueue event system
  604 + */
  605 +
/*
 * Create the kqueue and allocate the bookkeeping for the kqueue event
 * system: the per-descriptor array (FD_SETSIZE entries initially) and
 * the event, add and delete lists (ST_KQ_MIN_EVTLIST_SIZE entries each).
 *
 * Returns 0 on success.  On failure everything acquired so far is
 * released, _st_kq_data is left NULL, errno describes the failure and
 * -1 is returned.
 */
ST_HIDDEN int _st_kq_init(void)
{
    struct _st_kqdata *kd;
    int saved_errno;

    kd = (struct _st_kqdata *) calloc(1, sizeof(*kd));
    _st_kq_data = kd;
    if (kd == NULL)
        return -1;

    kd->kq = kqueue();
    if (kd->kq < 0) {
        saved_errno = errno;
        goto fail;
    }
    fcntl(kd->kq, F_SETFD, FD_CLOEXEC);
    kd->pid = getpid();

    /*
     * Allocate file descriptor data array.
     * FD_SETSIZE looks like good initial size.
     */
    kd->fd_data_size = FD_SETSIZE;
    kd->fd_data = (_kq_fd_data_t *) calloc(kd->fd_data_size,
                                           sizeof(_kq_fd_data_t));
    if (kd->fd_data == NULL) {
        saved_errno = errno;
        goto fail;
    }

    /* Allocate event lists */
    kd->evtlist_size = ST_KQ_MIN_EVTLIST_SIZE;
    kd->evtlist = (struct kevent *) malloc(kd->evtlist_size *
                                           sizeof(struct kevent));
    kd->addlist_size = ST_KQ_MIN_EVTLIST_SIZE;
    kd->addlist = (struct kevent *) malloc(kd->addlist_size *
                                           sizeof(struct kevent));
    kd->dellist_size = ST_KQ_MIN_EVTLIST_SIZE;
    kd->dellist = (struct kevent *) malloc(kd->dellist_size *
                                           sizeof(struct kevent));
    if (kd->evtlist && kd->addlist && kd->dellist)
        return 0;
    saved_errno = ENOMEM;

 fail:
    /* Unallocated lists are still NULL from calloc(), so free() is safe */
    if (kd->kq >= 0)
        close(kd->kq);
    free(kd->fd_data);
    free(kd->evtlist);
    free(kd->addlist);
    free(kd->dellist);
    free(kd);
    _st_kq_data = NULL;
    errno = saved_errno;

    return -1;
}
  667 +
  668 +ST_HIDDEN int _st_kq_fd_data_expand(int maxfd)
  669 +{
  670 + _kq_fd_data_t *ptr;
  671 + int n = _st_kq_data->fd_data_size;
  672 +
  673 + while (maxfd >= n)
  674 + n <<= 1;
  675 +
  676 + ptr = (_kq_fd_data_t *)realloc(_st_kq_data->fd_data,
  677 + n * sizeof(_kq_fd_data_t));
  678 + if (!ptr)
  679 + return -1;
  680 +
  681 + memset(ptr + _st_kq_data->fd_data_size, 0,
  682 + (n - _st_kq_data->fd_data_size) * sizeof(_kq_fd_data_t));
  683 +
  684 + _st_kq_data->fd_data = ptr;
  685 + _st_kq_data->fd_data_size = n;
  686 +
  687 + return 0;
  688 +}
  689 +
/*
 * Grow the add list until at least `avail` free slots follow the
 * entries already queued; the size is doubled as often as needed.
 * The result event list is grown to the same size on a best-effort
 * basis.
 *
 * Returns 0 on success, -1 if the add list could not be reallocated.
 */
ST_HIDDEN int _st_kq_addlist_expand(int avail)
{
    int new_size = _st_kq_data->addlist_size;
    struct kevent *grown;

    while (new_size - _st_kq_data->addlist_cnt < avail)
        new_size <<= 1;

    grown = (struct kevent *) realloc(_st_kq_data->addlist,
                                      new_size * sizeof(struct kevent));
    if (grown == NULL)
        return -1;

    _st_kq_data->addlist = grown;
    _st_kq_data->addlist_size = new_size;

    /*
     * Try to expand the result event list too
     * (although we don't have to do it).
     */
    grown = (struct kevent *) realloc(_st_kq_data->evtlist,
                                      new_size * sizeof(struct kevent));
    if (grown != NULL) {
        _st_kq_data->evtlist = grown;
        _st_kq_data->evtlist_size = new_size;
    }

    return 0;
}
  719 +
/*
 * Append a copy of *kev to the add list.  The caller must have made
 * room beforehand (see _st_kq_addlist_expand()); this is only asserted.
 */
ST_HIDDEN void _st_kq_addlist_add(const struct kevent *kev)
{
    int slot = _st_kq_data->addlist_cnt;

    ST_ASSERT(slot < _st_kq_data->addlist_size);
    memcpy(&_st_kq_data->addlist[slot], kev, sizeof(*kev));
    _st_kq_data->addlist_cnt = slot + 1;
}
  727 +
/*
 * Append a copy of *kev to the delete list, doubling the list when it
 * is full.  If the list cannot be grown the entry is silently dropped;
 * _st_kq_pollset_del() explains why a lost delete is acceptable.
 */
ST_HIDDEN void _st_kq_dellist_add(const struct kevent *kev)
{
    if (_st_kq_data->dellist_cnt >= _st_kq_data->dellist_size) {
        int new_size = _st_kq_data->dellist_size << 1;
        struct kevent *grown;

        grown = (struct kevent *) realloc(_st_kq_data->dellist,
                                          new_size * sizeof(struct kevent));
        if (grown == NULL) {
            /* See comment in _st_kq_pollset_del() */
            return;
        }

        _st_kq_data->dellist = grown;
        _st_kq_data->dellist_size = new_size;
    }

    memcpy(&_st_kq_data->dellist[_st_kq_data->dellist_cnt], kev,
           sizeof(*kev));
    _st_kq_data->dellist_cnt++;
}
  751 +
/*
 * Register a poll set with the kqueue event system.
 *
 * Each entry must have a non-negative fd and request POLLIN and/or
 * POLLOUT and nothing else.  For every descriptor/event whose
 * reference count goes from 0 to 1, a one-shot EV_ADD change is queued
 * on the add list; it is submitted by the next kevent() call in
 * _st_kq_dispatch().
 *
 * Returns 0 on success.  Returns -1 with errno set to EINVAL for an
 * invalid entry, or -1 if the per-descriptor array or the add list
 * cannot be grown; no reference count is changed in either case.
 */
ST_HIDDEN int _st_kq_pollset_add(struct pollfd *pds, int npds)
{
    struct kevent kev;
    int i, fd, slots;
    short ev;

    /*
     * Pollset adding is "atomic". That is, either it succeeded for
     * all descriptors in the set or it failed. It means that we
     * need to do all the checks up front so we don't have to
     * "unwind" if adding of one of the descriptors failed.
     */
    for (i = 0; i < npds; i++) {
        fd = pds[i].fd;
        ev = pds[i].events;

        /* POLLIN and/or POLLOUT must be set, but nothing else */
        if (fd < 0 || ev == 0 || (ev & ~(POLLIN | POLLOUT)) != 0) {
            errno = EINVAL;
            return -1;
        }
        if (fd >= _st_kq_data->fd_data_size &&
            _st_kq_fd_data_expand(fd) < 0)
            return -1;
    }

    /*
     * Make sure we have enough room in the addlist for twice as many
     * descriptors as in the pollset (for both READ and WRITE filters).
     */
    slots = npds << 1;
    if (slots > _st_kq_data->addlist_size - _st_kq_data->addlist_cnt &&
        _st_kq_addlist_expand(slots) < 0)
        return -1;

    for (i = 0; i < npds; i++) {
        fd = pds[i].fd;
        ev = pds[i].events;

        if (ev & POLLIN) {
            /* Only the first reader registers the filter */
            if (_ST_KQ_READ_CNT(fd)++ == 0) {
                memset(&kev, 0, sizeof(kev));
                kev.ident = fd;
                kev.filter = EVFILT_READ;
#ifdef NOTE_EOF
                /* Make it behave like select() and poll() */
                kev.fflags = NOTE_EOF;
#endif
                kev.flags = (EV_ADD | EV_ONESHOT);
                _st_kq_addlist_add(&kev);
            }
        }
        if (ev & POLLOUT) {
            /* Only the first writer registers the filter */
            if (_ST_KQ_WRITE_CNT(fd)++ == 0) {
                memset(&kev, 0, sizeof(kev));
                kev.ident = fd;
                kev.filter = EVFILT_WRITE;
                kev.flags = (EV_ADD | EV_ONESHOT);
                _st_kq_addlist_add(&kev);
            }
        }
    }

    return 0;
}
  807 +
/*
 * Unregister a poll set from the kqueue event system.  For every
 * descriptor/event whose reference count drops to zero an EV_DELETE
 * change is collected, and the collected changes are submitted to the
 * kernel right away.
 */
ST_HIDDEN void _st_kq_pollset_del(struct pollfd *pds, int npds)
{
    struct kevent kev;
    int i, fd, rv;
    short ev;

    /*
     * It's OK if deleting fails because a descriptor will either be
     * closed or fire only once (we set EV_ONESHOT flag).
     */
    _st_kq_data->dellist_cnt = 0;
    for (i = 0; i < npds; i++) {
        fd = pds[i].fd;
        ev = pds[i].events;

        if (ev & POLLIN) {
            if (--_ST_KQ_READ_CNT(fd) == 0) {
                memset(&kev, 0, sizeof(kev));
                kev.ident = fd;
                kev.filter = EVFILT_READ;
                kev.flags = EV_DELETE;
                _st_kq_dellist_add(&kev);
            }
        }
        if (ev & POLLOUT) {
            if (--_ST_KQ_WRITE_CNT(fd) == 0) {
                memset(&kev, 0, sizeof(kev));
                kev.ident = fd;
                kev.filter = EVFILT_WRITE;
                kev.flags = EV_DELETE;
                _st_kq_dellist_add(&kev);
            }
        }
    }

    if (_st_kq_data->dellist_cnt <= 0)
        return;

    /*
     * We do "synchronous" kqueue deletes to avoid deleting
     * closed descriptors and other possible problems.
     */
    do {
        /* This kevent() won't block since result list size is 0 */
        rv = kevent(_st_kq_data->kq, _st_kq_data->dellist,
                    _st_kq_data->dellist_cnt, NULL, 0, NULL);
    } while (rv < 0 && errno == EINTR);
}
  849 +
/*
 * Submit the queued add-list changes and wait in kevent() for I/O, or
 * until the earliest sleeper is due.  Returned events are folded into
 * the per-descriptor revents, every poll request with a ready entry is
 * taken off the I/O queue and its thread made runnable.
 *
 * If kevent() fails with EBADF and the pid has changed since the
 * kqueue was created, a new kqueue is created, all queued requests are
 * registered again and the wait is retried.
 */
ST_HIDDEN void _st_kq_dispatch(void)
{
    struct timespec ts, *tsp = NULL;
    struct kevent kev;
    struct kevent *evt;
    st_utime_t wait_us;
    _st_clist_t *q;
    _st_pollq_t *pq;
    struct pollfd *pd, *pd_end;
    int nfd, i, fd, fired, rv;
    short wanted, got;

    if (_ST_SLEEPQ != NULL) {
        wait_us = (_ST_SLEEPQ->due > _ST_LAST_CLOCK) ?
            (_ST_SLEEPQ->due - _ST_LAST_CLOCK) : 0;
        ts.tv_sec = (time_t) (wait_us / 1000000);
        ts.tv_nsec = (long) ((wait_us % 1000000) * 1000);
        tsp = &ts;
    }

    for (;;) {
        /* Check for I/O operations */
        nfd = kevent(_st_kq_data->kq,
                     _st_kq_data->addlist, _st_kq_data->addlist_cnt,
                     _st_kq_data->evtlist, _st_kq_data->evtlist_size, tsp);

        _st_kq_data->addlist_cnt = 0;

        if (nfd >= 0)
            break;

        if (errno != EBADF || _st_kq_data->pid == getpid())
            return;

        /* We probably forked, reinitialize kqueue */
        _st_kq_data->kq = kqueue();
        if (_st_kq_data->kq < 0) {
            /* There is nothing we can do here, will retry later */
            return;
        }
        fcntl(_st_kq_data->kq, F_SETFD, FD_CLOEXEC);
        _st_kq_data->pid = getpid();

        /* Re-register all descriptors on ioq with new kqueue */
        memset(_st_kq_data->fd_data, 0,
               _st_kq_data->fd_data_size * sizeof(_kq_fd_data_t));
        for (q = _ST_IOQ.next; q != &_ST_IOQ; q = q->next) {
            pq = _ST_POLLQUEUE_PTR(q);
            _st_kq_pollset_add(pq->pds, pq->npds);
        }
    }

    if (nfd == 0)
        return;

    /* Pass 1: fold the returned events into the per-descriptor revents */
    for (i = 0; i < nfd; i++) {
        evt = &_st_kq_data->evtlist[i];
        fd = evt->ident;

        if (evt->filter == EVFILT_READ) {
            _ST_KQ_REVENTS(fd) |= POLLIN;
        } else if (evt->filter == EVFILT_WRITE) {
            _ST_KQ_REVENTS(fd) |= POLLOUT;
        }
        if (evt->flags & EV_ERROR) {
            if (evt->data == EBADF) {
                _ST_KQ_REVENTS(fd) |= POLLNVAL;
            } else {
                _ST_KQ_REVENTS(fd) |= POLLERR;
            }
        }
    }

    _st_kq_data->dellist_cnt = 0;

    /* Pass 2: deliver revents to the queued requests and wake their threads */
    for (q = _ST_IOQ.next; q != &_ST_IOQ; q = q->next) {
        pq = _ST_POLLQUEUE_PTR(q);
        fired = 0;
        pd_end = pq->pds + pq->npds;

        for (pd = pq->pds; pd < pd_end; pd++) {
            fd = pd->fd;
            wanted = pd->events;
            /* Error bits are always reported; IN/OUT only when requested */
            got = (short)(_ST_KQ_REVENTS(fd) & ~(POLLIN | POLLOUT));
            if ((wanted & POLLIN) && (_ST_KQ_REVENTS(fd) & POLLIN)) {
                got |= POLLIN;
            }
            if ((wanted & POLLOUT) && (_ST_KQ_REVENTS(fd) & POLLOUT)) {
                got |= POLLOUT;
            }
            pd->revents = got;
            if (got) {
                fired = 1;
            }
        }

        if (!fired)
            continue;

        ST_REMOVE_LINK(&pq->links);
        pq->on_ioq = 0;
        for (pd = pq->pds; pd < pd_end; pd++) {
            fd = pd->fd;
            wanted = pd->events;
            /*
             * We set EV_ONESHOT flag so we only need to delete
             * descriptor if it didn't fire.
             */
            if ((wanted & POLLIN) && (--_ST_KQ_READ_CNT(fd) == 0) &&
                ((_ST_KQ_REVENTS(fd) & POLLIN) == 0)) {
                memset(&kev, 0, sizeof(kev));
                kev.ident = fd;
                kev.filter = EVFILT_READ;
                kev.flags = EV_DELETE;
                _st_kq_dellist_add(&kev);
            }
            if ((wanted & POLLOUT) && (--_ST_KQ_WRITE_CNT(fd) == 0) &&
                ((_ST_KQ_REVENTS(fd) & POLLOUT) == 0)) {
                memset(&kev, 0, sizeof(kev));
                kev.ident = fd;
                kev.filter = EVFILT_WRITE;
                kev.flags = EV_DELETE;
                _st_kq_dellist_add(&kev);
            }
        }

        if (pq->thread->flags & _ST_FL_ON_SLEEPQ)
            _ST_DEL_SLEEPQ(pq->thread);
        pq->thread->state = _ST_ST_RUNNABLE;
        _ST_ADD_RUNQ(pq->thread);
    }

    if (_st_kq_data->dellist_cnt > 0) {
        do {
            /* This kevent() won't block since result list size is 0 */
            rv = kevent(_st_kq_data->kq, _st_kq_data->dellist,
                        _st_kq_data->dellist_cnt, NULL, 0, NULL);
        } while (rv < 0 && errno == EINTR);
    }

    /* Pass 3: reset the revents of every descriptor that fired */
    for (i = 0; i < nfd; i++) {
        fd = _st_kq_data->evtlist[i].ident;
        _ST_KQ_REVENTS(fd) = 0;
    }
}
  989 +
  990 +ST_HIDDEN int _st_kq_fd_new(int osfd)
  991 +{
  992 + if (osfd >= _st_kq_data->fd_data_size && _st_kq_fd_data_expand(osfd) < 0)
  993 + return -1;
  994 +
  995 + return 0;
  996 +}
  997 +
  998 +ST_HIDDEN int _st_kq_fd_close(int osfd)
  999 +{
  1000 + if (_ST_KQ_READ_CNT(osfd) || _ST_KQ_WRITE_CNT(osfd)) {
  1001 + errno = EBUSY;
  1002 + return -1;
  1003 + }
  1004 +
  1005 + return 0;
  1006 +}
  1007 +
/*
 * Report the descriptor limit of the kqueue event system.
 */
ST_HIDDEN int _st_kq_fd_getlimit(void)
{
    /* zero means no specific limit */
    return 0;
}
  1013 +
/*
 * Descriptor of the kqueue event system.  The initializer is
 * positional, so the entries must stay in the order in which the
 * members of _st_eventsys_t are declared (declared outside this file).
 */
static _st_eventsys_t _st_kq_eventsys = {
    "kqueue",           /* name */
    ST_EVENTSYS_ALT,    /* event system identifier */
    _st_kq_init,        /* create the kqueue and allocate bookkeeping */
    _st_kq_dispatch,    /* wait for I/O and wake threads */
    _st_kq_pollset_add, /* register a poll set */
    _st_kq_pollset_del, /* unregister a poll set */
    _st_kq_fd_new,      /* make room for a new descriptor */
    _st_kq_fd_close,    /* check a descriptor may be closed */
    _st_kq_fd_getlimit  /* report the descriptor limit */
};
  1025 +#endif /* MD_HAVE_KQUEUE */
  1026 +
  1027 +
  1028 +#ifdef MD_HAVE_EPOLL
  1029 +/*****************************************
  1030 + * epoll event system
  1031 + */
  1032 +
  1033 +ST_HIDDEN int _st_epoll_init(void)
  1034 +{
  1035 + int fdlim;
  1036 + int err = 0;
  1037 + int rv = 0;
  1038 +
  1039 + _st_epoll_data =
  1040 + (struct _st_epolldata *) calloc(1, sizeof(*_st_epoll_data));
  1041 + if (!_st_epoll_data)
  1042 + return -1;
  1043 +
  1044 + fdlim = st_getfdlimit();
  1045 + _st_epoll_data->fd_hint = (fdlim > 0 && fdlim < ST_EPOLL_EVTLIST_SIZE) ?
  1046 + fdlim : ST_EPOLL_EVTLIST_SIZE;
  1047 +
  1048 + if ((_st_epoll_data->epfd = epoll_create(_st_epoll_data->fd_hint)) < 0) {
  1049 + err = errno;
  1050 + rv = -1;
  1051 + goto cleanup_epoll;
  1052 + }
  1053 + fcntl(_st_epoll_data->epfd, F_SETFD, FD_CLOEXEC);
  1054 + _st_epoll_data->pid = getpid();
  1055 +
  1056 + /* Allocate file descriptor data array */
  1057 + _st_epoll_data->fd_data_size = _st_epoll_data->fd_hint;
  1058 + _st_epoll_data->fd_data =
  1059 + (_epoll_fd_data_t *)calloc(_st_epoll_data->fd_data_size,
  1060 + sizeof(_epoll_fd_data_t));
  1061 + if (!_st_epoll_data->fd_data) {
  1062 + err = errno;
  1063 + rv = -1;
  1064 + goto cleanup_epoll;
  1065 + }
  1066 +
  1067 + /* Allocate event lists */
  1068 + _st_epoll_data->evtlist_size = _st_epoll_data->fd_hint;
  1069 + _st_epoll_data->evtlist =
  1070 + (struct epoll_event *)malloc(_st_epoll_data->evtlist_size *
  1071 + sizeof(struct epoll_event));
  1072 + if (!_st_epoll_data->evtlist) {
  1073 + err = errno;
  1074 + rv = -1;
  1075 + }
  1076 +
  1077 + cleanup_epoll:
  1078 + if (rv < 0) {
  1079 + if (_st_epoll_data->epfd >= 0)
  1080 + close(_st_epoll_data->epfd);
  1081 + free(_st_epoll_data->fd_data);
  1082 + free(_st_epoll_data->evtlist);
  1083 + free(_st_epoll_data);
  1084 + _st_epoll_data = NULL;
  1085 + errno = err;
  1086 + }
  1087 +
  1088 + return rv;
  1089 +}
  1090 +
  1091 +ST_HIDDEN int _st_epoll_fd_data_expand(int maxfd)
  1092 +{
  1093 + _epoll_fd_data_t *ptr;
  1094 + int n = _st_epoll_data->fd_data_size;
  1095 +
  1096 + while (maxfd >= n)
  1097 + n <<= 1;
  1098 +
  1099 + ptr = (_epoll_fd_data_t *)realloc(_st_epoll_data->fd_data,
  1100 + n * sizeof(_epoll_fd_data_t));
  1101 + if (!ptr)
  1102 + return -1;
  1103 +
  1104 + memset(ptr + _st_epoll_data->fd_data_size, 0,
  1105 + (n - _st_epoll_data->fd_data_size) * sizeof(_epoll_fd_data_t));
  1106 +
  1107 + _st_epoll_data->fd_data = ptr;
  1108 + _st_epoll_data->fd_data_size = n;
  1109 +
  1110 + return 0;
  1111 +}
  1112 +
  1113 +ST_HIDDEN void _st_epoll_evtlist_expand(void)
  1114 +{
  1115 + struct epoll_event *ptr;
  1116 + int n = _st_epoll_data->evtlist_size;
  1117 +
  1118 + while (_st_epoll_data->evtlist_cnt > n)
  1119 + n <<= 1;
  1120 +
  1121 + ptr = (struct epoll_event *)realloc(_st_epoll_data->evtlist,
  1122 + n * sizeof(struct epoll_event));
  1123 + if (ptr) {
  1124 + _st_epoll_data->evtlist = ptr;
  1125 + _st_epoll_data->evtlist_size = n;
  1126 + }
  1127 +}
  1128 +
  1129 +ST_HIDDEN void _st_epoll_pollset_del(struct pollfd *pds, int npds)
  1130 +{
  1131 + struct epoll_event ev;
  1132 + struct pollfd *pd;
  1133 + struct pollfd *epd = pds + npds;
  1134 + int old_events, events, op;
  1135 +
  1136 + /*
  1137 + * It's more or less OK if deleting fails because a descriptor
  1138 + * will either be closed or deleted in dispatch function after
  1139 + * it fires.
  1140 + */
  1141 + for (pd = pds; pd < epd; pd++) {
  1142 + old_events = _ST_EPOLL_EVENTS(pd->fd);
  1143 +
  1144 + if (pd->events & POLLIN)
  1145 + _ST_EPOLL_READ_CNT(pd->fd)--;
  1146 + if (pd->events & POLLOUT)
  1147 + _ST_EPOLL_WRITE_CNT(pd->fd)--;
  1148 + if (pd->events & POLLPRI)
  1149 + _ST_EPOLL_EXCEP_CNT(pd->fd)--;
  1150 +
  1151 + events = _ST_EPOLL_EVENTS(pd->fd);
  1152 + /*
  1153 + * The _ST_EPOLL_REVENTS check below is needed so we can use
  1154 + * this function inside dispatch(). Outside of dispatch()
  1155 + * _ST_EPOLL_REVENTS is always zero for all descriptors.
  1156 + */
  1157 + if (events != old_events && _ST_EPOLL_REVENTS(pd->fd) == 0) {
  1158 + op = events ? EPOLL_CTL_MOD : EPOLL_CTL_DEL;
  1159 + ev.events = events;
  1160 + ev.data.fd = pd->fd;
  1161 + if (epoll_ctl(_st_epoll_data->epfd, op, pd->fd, &ev) == 0 &&
  1162 + op == EPOLL_CTL_DEL) {
  1163 + _st_epoll_data->evtlist_cnt--;
  1164 + }
  1165 + }
  1166 + }
  1167 +}
  1168 +
  1169 +ST_HIDDEN int _st_epoll_pollset_add(struct pollfd *pds, int npds)
  1170 +{
  1171 + struct epoll_event ev;
  1172 + int i, fd;
  1173 + int old_events, events, op;
  1174 +
  1175 + /* Do as many checks as possible up front */
  1176 + for (i = 0; i < npds; i++) {
  1177 + fd = pds[i].fd;
  1178 + if (fd < 0 || !pds[i].events ||
  1179 + (pds[i].events & ~(POLLIN | POLLOUT | POLLPRI))) {
  1180 + errno = EINVAL;
  1181 + return -1;
  1182 + }
  1183 + if (fd >= _st_epoll_data->fd_data_size &&
  1184 + _st_epoll_fd_data_expand(fd) < 0)
  1185 + return -1;
  1186 + }
  1187 +
  1188 + for (i = 0; i < npds; i++) {
  1189 + fd = pds[i].fd;
  1190 + old_events = _ST_EPOLL_EVENTS(fd);
  1191 +
  1192 + if (pds[i].events & POLLIN)
  1193 + _ST_EPOLL_READ_CNT(fd)++;
  1194 + if (pds[i].events & POLLOUT)
  1195 + _ST_EPOLL_WRITE_CNT(fd)++;
  1196 + if (pds[i].events & POLLPRI)
  1197 + _ST_EPOLL_EXCEP_CNT(fd)++;
  1198 +
  1199 + events = _ST_EPOLL_EVENTS(fd);
  1200 + if (events != old_events) {
  1201 + op = old_events ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
  1202 + ev.events = events;
  1203 + ev.data.fd = fd;
  1204 + if (epoll_ctl(_st_epoll_data->epfd, op, fd, &ev) < 0 &&
  1205 + (op != EPOLL_CTL_ADD || errno != EEXIST))
  1206 + break;
  1207 + if (op == EPOLL_CTL_ADD) {
  1208 + _st_epoll_data->evtlist_cnt++;
  1209 + if (_st_epoll_data->evtlist_cnt > _st_epoll_data->evtlist_size)
  1210 + _st_epoll_evtlist_expand();
  1211 + }
  1212 + }
  1213 + }
  1214 +
  1215 + if (i < npds) {
  1216 + /* Error */
  1217 + int err = errno;
  1218 + /* Unroll the state */
  1219 + _st_epoll_pollset_del(pds, i + 1);
  1220 + errno = err;
  1221 + return -1;
  1222 + }
  1223 +
  1224 + return 0;
  1225 +}
  1226 +
/*
 * Wait for I/O events with epoll_wait() and wake up the threads whose
 * poll requests were satisfied.
 *
 * Steps:
 *   1. Derive the epoll_wait() timeout from the head of the sleep queue.
 *   2. If the process id changed since the epoll set was created, rebuild
 *      the set from the requests on the I/O queue.
 *   3. Record the returned events per descriptor, translate them into
 *      pollfd revents for every queued request, and make the owning
 *      threads of satisfied requests runnable.
 *   4. Clear the recorded events and update/delete the kernel
 *      registrations of the descriptors that fired.
 */
ST_HIDDEN void _st_epoll_dispatch(void)
{
    st_utime_t min_timeout;
    _st_clist_t *q;
    _st_pollq_t *pq;
    struct pollfd *pds, *epds;
    struct epoll_event ev;
    int timeout, nfd, i, osfd, notify;
    int events, op;
    short revents;

    if (_ST_SLEEPQ == NULL) {
        /* No sleeper: -1 makes epoll_wait() block without a time limit */
        timeout = -1;
    } else {
        /* Time left until the earliest sleeper is due (0 if already due) */
        min_timeout = (_ST_SLEEPQ->due <= _ST_LAST_CLOCK) ? 0 :
            (_ST_SLEEPQ->due - _ST_LAST_CLOCK);
        /*
         * epoll_wait() takes milliseconds; presumably min_timeout is in
         * microseconds -- confirm against st_utime_t.
         * NOTE(review): the division truncates, so a remainder below 1000
         * becomes a zero (non-blocking) timeout.
         */
        timeout = (int) (min_timeout / 1000);
    }

    if (_st_epoll_data->pid != getpid()) {
        /* We probably forked, reinitialize epoll set */
        close(_st_epoll_data->epfd);
        _st_epoll_data->epfd = epoll_create(_st_epoll_data->fd_hint);
        if (_st_epoll_data->epfd < 0) {
            /* There is nothing we can do here, will retry later */
            return;
        }
        fcntl(_st_epoll_data->epfd, F_SETFD, FD_CLOEXEC);
        _st_epoll_data->pid = getpid();

        /* Put all descriptors on ioq into new epoll set */
        memset(_st_epoll_data->fd_data, 0,
               _st_epoll_data->fd_data_size * sizeof(_epoll_fd_data_t));
        _st_epoll_data->evtlist_cnt = 0;
        for (q = _ST_IOQ.next; q != &_ST_IOQ; q = q->next) {
            pq = _ST_POLLQUEUE_PTR(q);
            /* NOTE(review): the return value is ignored here */
            _st_epoll_pollset_add(pq->pds, pq->npds);
        }
    }

    /* Check for I/O operations */
    nfd = epoll_wait(_st_epoll_data->epfd, _st_epoll_data->evtlist,
                     _st_epoll_data->evtlist_size, timeout);

    if (nfd > 0) {
        /* Pass 1: remember what fired, indexed by descriptor */
        for (i = 0; i < nfd; i++) {
            osfd = _st_epoll_data->evtlist[i].data.fd;
            _ST_EPOLL_REVENTS(osfd) = _st_epoll_data->evtlist[i].events;
            if (_ST_EPOLL_REVENTS(osfd) & (EPOLLERR | EPOLLHUP)) {
                /* Also set I/O bits on error */
                _ST_EPOLL_REVENTS(osfd) |= _ST_EPOLL_EVENTS(osfd);
            }
        }

        /* Pass 2: match the recorded events against every queued request */
        for (q = _ST_IOQ.next; q != &_ST_IOQ; q = q->next) {
            pq = _ST_POLLQUEUE_PTR(q);
            notify = 0;
            epds = pq->pds + pq->npds;

            for (pds = pq->pds; pds < epds; pds++) {
                if (_ST_EPOLL_REVENTS(pds->fd) == 0) {
                    pds->revents = 0;
                    continue;
                }
                osfd = pds->fd;
                events = pds->events;
                revents = 0;
                /* Report I/O bits only if this request asked for them */
                if ((events & POLLIN) && (_ST_EPOLL_REVENTS(osfd) & EPOLLIN))
                    revents |= POLLIN;
                if ((events & POLLOUT) && (_ST_EPOLL_REVENTS(osfd) & EPOLLOUT))
                    revents |= POLLOUT;
                if ((events & POLLPRI) && (_ST_EPOLL_REVENTS(osfd) & EPOLLPRI))
                    revents |= POLLPRI;
                /* Error and hangup are reported regardless of the request */
                if (_ST_EPOLL_REVENTS(osfd) & EPOLLERR)
                    revents |= POLLERR;
                if (_ST_EPOLL_REVENTS(osfd) & EPOLLHUP)
                    revents |= POLLHUP;

                pds->revents = revents;
                if (revents) {
                    notify = 1;
                }
            }
            if (notify) {
                ST_REMOVE_LINK(&pq->links);
                pq->on_ioq = 0;
                /*
                 * Here we will only delete/modify descriptors that
                 * didn't fire (see comments in _st_epoll_pollset_del()).
                 */
                _st_epoll_pollset_del(pq->pds, pq->npds);

                /* Take the thread off the sleep queue, if there, and run it */
                if (pq->thread->flags & _ST_FL_ON_SLEEPQ)
                    _ST_DEL_SLEEPQ(pq->thread);
                pq->thread->state = _ST_ST_RUNNABLE;
                _ST_ADD_RUNQ(pq->thread);
            }
        }

        /* Pass 3: reset the recorded events and sync the kernel set */
        for (i = 0; i < nfd; i++) {
            /* Delete/modify descriptors that fired */
            osfd = _st_epoll_data->evtlist[i].data.fd;
            _ST_EPOLL_REVENTS(osfd) = 0;
            events = _ST_EPOLL_EVENTS(osfd);
            op = events ? EPOLL_CTL_MOD : EPOLL_CTL_DEL;
            ev.events = events;
            ev.data.fd = osfd;
            if (epoll_ctl(_st_epoll_data->epfd, op, osfd, &ev) == 0 &&
                op == EPOLL_CTL_DEL) {
                _st_epoll_data->evtlist_cnt--;
            }
        }
    }
}
  1341 +
  1342 +ST_HIDDEN int _st_epoll_fd_new(int osfd)
  1343 +{
  1344 + if (osfd >= _st_epoll_data->fd_data_size &&
  1345 + _st_epoll_fd_data_expand(osfd) < 0)
  1346 + return -1;
  1347 +
  1348 + return 0;
  1349 +}
  1350 +
  1351 +ST_HIDDEN int _st_epoll_fd_close(int osfd)
  1352 +{
  1353 + if (_ST_EPOLL_READ_CNT(osfd) || _ST_EPOLL_WRITE_CNT(osfd) ||
  1354 + _ST_EPOLL_EXCEP_CNT(osfd)) {
  1355 + errno = EBUSY;
  1356 + return -1;
  1357 + }
  1358 +
  1359 + return 0;
  1360 +}
  1361 +
  1362 +ST_HIDDEN int _st_epoll_fd_getlimit(void)
  1363 +{
  1364 + /* zero means no specific limit */
  1365 + return 0;
  1366 +}
  1367 +
  1368 +/*
  1369 + * Check if epoll functions are just stubs.
  1370 + */
  1371 +ST_HIDDEN int _st_epoll_is_supported(void)
  1372 +{
  1373 + struct epoll_event ev;
  1374 +
  1375 + ev.events = EPOLLIN;
  1376 + ev.data.ptr = NULL;
  1377 + /* Guaranteed to fail */
  1378 + epoll_ctl(-1, EPOLL_CTL_ADD, -1, &ev);
  1379 +
  1380 + return (errno != ENOSYS);
  1381 +}
  1382 +
/*
 * Event system descriptor for the epoll backend. The entries are given
 * positionally, so they must follow the field order of _st_eventsys_t
 * (declared outside this view): a name string, a selector value, and the
 * backend entry points defined above.
 */
static _st_eventsys_t _st_epoll_eventsys = {
    "epoll",                /* presumably the `name` read by st_get_eventsys_name() */
    ST_EVENTSYS_ALT,        /* presumably the `val` read by st_get_eventsys() */
    _st_epoll_init,         /* set up the epoll state */
    _st_epoll_dispatch,     /* wait for events and wake threads */
    _st_epoll_pollset_add,  /* register poll requests */
    _st_epoll_pollset_del,  /* unregister poll requests */
    _st_epoll_fd_new,       /* reserve a table slot for a descriptor */
    _st_epoll_fd_close,     /* refuse to close descriptors still in use */
    _st_epoll_fd_getlimit   /* descriptor limit (0: none) */
};
  1394 +#endif /* MD_HAVE_EPOLL */
  1395 +
  1396 +
  1397 +/*****************************************
  1398 + * Public functions
  1399 + */
  1400 +
  1401 +int st_set_eventsys(int eventsys)
  1402 +{
  1403 + if (_st_eventsys) {
  1404 + errno = EBUSY;
  1405 + return -1;
  1406 + }
  1407 +
  1408 + switch (eventsys) {
  1409 + case ST_EVENTSYS_DEFAULT:
  1410 +#ifdef USE_POLL
  1411 + _st_eventsys = &_st_poll_eventsys;
  1412 +#else
  1413 + _st_eventsys = &_st_select_eventsys;
  1414 +#endif
  1415 + break;
  1416 + case ST_EVENTSYS_SELECT:
  1417 + _st_eventsys = &_st_select_eventsys;
  1418 + break;
  1419 +#ifdef MD_HAVE_POLL
  1420 + case ST_EVENTSYS_POLL:
  1421 + _st_eventsys = &_st_poll_eventsys;
  1422 + break;
  1423 +#endif
  1424 + case ST_EVENTSYS_ALT:
  1425 +#if defined (MD_HAVE_KQUEUE)
  1426 + _st_eventsys = &_st_kq_eventsys;
  1427 +#elif defined (MD_HAVE_EPOLL)
  1428 + if (_st_epoll_is_supported())
  1429 + _st_eventsys = &_st_epoll_eventsys;
  1430 +#endif
  1431 + break;
  1432 + default:
  1433 + errno = EINVAL;
  1434 + return -1;
  1435 + }
  1436 +
  1437 + return 0;
  1438 +}
  1439 +
  1440 +int st_get_eventsys(void)
  1441 +{
  1442 + return _st_eventsys ? _st_eventsys->val : -1;
  1443 +}
  1444 +
  1445 +const char *st_get_eventsys_name(void)
  1446 +{
  1447 + return _st_eventsys ? _st_eventsys->name : "";
  1448 +}
  1449 +
  1 +#
  2 +# Portions created by SGI are Copyright (C) 2000 Silicon Graphics, Inc.
  3 +# All Rights Reserved.
  4 +#
  5 +# Redistribution and use in source and binary forms, with or without
  6 +# modification, are permitted provided that the following conditions
  7 +# are met:
  8 +#
  9 +# 1. Redistributions of source code must retain the above copyright
  10 +# notice, this list of conditions and the following disclaimer.
  11 +# 2. Redistributions in binary form must reproduce the above copyright
  12 +# notice, this list of conditions and the following disclaimer in the
  13 +# documentation and/or other materials provided with the distribution.
  14 +# 3. Neither the name of Silicon Graphics, Inc. nor the names of its
  15 +# contributors may be used to endorse or promote products derived from
  16 +# this software without specific prior written permission.
  17 +#
  18 +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  19 +# ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  20 +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  21 +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  22 +# HOLDERS AND CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  23 +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
  24 +# TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  25 +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  26 +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  27 +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  28 +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  29 +
  30 +##########################
  31 +# Supported OSes:
  32 +#
  33 +# AIX
  34 +# FREEBSD
  35 +# HPUX
  36 +# HPUX_64
  37 +# IRIX
  38 +# IRIX_64
  39 +# LINUX
  40 +# LINUX_IA64
  41 +# NETBSD
  42 +# OPENBSD
  43 +# OSF1
  44 +# SOLARIS
  45 +# SOLARIS_64
  46 +
  47 +##########################
  48 +
# Tools used by the rules below.
CC = cc

SHELL = /bin/sh
ECHO = /bin/echo

# DEPTH is the path to the parent directory; TARGETDIR is empty here and is
# presumably supplied by the invoking make -- confirm against the parent
# Makefile.
DEPTH = ..
BUILD =
TARGETDIR =

DEFINES =
CFLAGS =
OTHER_FLAGS =

# Objects and example binaries are written to OBJDIR; the library and its
# header are taken from the same directory under DEPTH.
OBJDIR = $(DEPTH)/$(TARGETDIR)
INCDIR = $(DEPTH)/$(TARGETDIR)
LIBST = $(OBJDIR)/libst.a
HEADER = $(INCDIR)/st.h

# Extra libraries; filled in by the platform section.
LIBRESOLV =
EXTRALIBS =

# Without OS the only goal is "unknown". This file has no rule for it, so it
# is handed to the .DEFAULT rule at the bottom.
ifeq ($(OS),)
EXAMPLES = unknown
else
EXAMPLES = $(OBJDIR)/lookupdns $(OBJDIR)/proxy $(OBJDIR)/server
endif


##########################
# Platform section.
#

ifeq (DARWIN, $(findstring DARWIN, $(OS)))
LIBRESOLV = -lresolv
endif

ifeq (LINUX, $(findstring LINUX, $(OS)))
LIBRESOLV = -lresolv
endif

ifeq (SOLARIS, $(findstring SOLARIS, $(OS)))
LIBRESOLV = -lresolv
EXTRALIBS = -lsocket -lnsl
endif

#
# End of platform section.
##########################


all: $(EXAMPLES)

# lookupdns additionally links res.o and the resolver library.
$(OBJDIR)/lookupdns: lookupdns.c $(OBJDIR)/res.o $(LIBST) $(HEADER)
	$(CC) $(CFLAGS) -I$(INCDIR) lookupdns.c $(OBJDIR)/res.o $(LIBST) $(LIBRESOLV) $(EXTRALIBS) -o $@

$(OBJDIR)/proxy: proxy.c $(LIBST) $(HEADER)
	$(CC) $(CFLAGS) -I$(INCDIR) proxy.c $(LIBST) $(EXTRALIBS) -o $@

# server additionally links error.o.
$(OBJDIR)/server: server.c $(OBJDIR)/error.o $(LIBST) $(HEADER)
	$(CC) $(CFLAGS) -I$(INCDIR) server.c $(OBJDIR)/error.o $(LIBST) $(EXTRALIBS) -o $@

# Helper objects (res.o, error.o) are compiled into OBJDIR.
$(OBJDIR)/%.o: %.c
	$(CC) $(CFLAGS) -I$(INCDIR) -c $< -o $@

# Any target without a rule here is forwarded to make in $(DEPTH).
.DEFAULT:
	@cd $(DEPTH); $(MAKE) $@
  115 +
  1 +Portions created by SGI are Copyright (C) 2000 Silicon Graphics, Inc.
  2 +All Rights Reserved.
  3 +
  4 +
  5 +This directory contains three example programs.
  6 +
  7 +
  8 +---------------------------------------------------------------------------
  9 +
  10 +PROGRAM
  11 +
  12 + lookupdns
  13 +
  14 +FILES
  15 +
  16 + lookupdns.c
  17 + res.c
  18 +
  19 +USAGE
  20 +
  21 + lookupdns <hostname1> [<hostname2>] ...
  22 +
  23 +DESCRIPTION
  24 +
  25 + This program performs asynchronous DNS host name resolution and reports
  26 + the IP address for each <hostname> specified as a command line argument.
  27 + One ST thread is created for each host name. All threads do host name
  28 + resolution concurrently.
  29 +
  30 +
  31 +---------------------------------------------------------------------------
  32 +
  33 +PROGRAM
  34 +
  35 + proxy
  36 +
  37 +FILES
  38 +
  39 + proxy.c
  40 +
  41 +USAGE
  42 +
  43 + proxy -l <local_addr> -r <remote_addr> [-p <num_processes>] [-S]
  44 +
  45 + -l <local_addr> bind to local address specified as [<host>]:<port>
  46 + -r <remote_addr> connect to remote address specified as <host>:<port>
  47 + -p <num_processes> create specified number of processes
  48 + -S serialize accept() calls from different processes
  49 + on the same listening socket (if needed).
  50 +
  51 +DESCRIPTION
  52 +
  53 + This program acts as a generic gateway. It listens for connections to a
  54 + local address. Upon accepting a client connection, it connects to the
  55 + specified remote address and then just pumps the data through without any
  56 + modification.
  57 +
  58 +
  59 +---------------------------------------------------------------------------
  60 +
  61 +PROGRAM
  62 +
  63 + server
  64 +
  65 +FILES
  66 +
  67 + server.c
  68 + error.c
  69 +
  70 +USAGE
  71 +
  72 + server -l <log_directory> [<options>]
  73 +
  74 + -l <log_directory> open all log files in specified directory.
  75 +
  76 + Possible options:
  77 +
  78 + -b <host>:<port> bind to specified address (multiple addresses
  79 + are permitted)
  80 + -p <num_processes> create specified number of processes
  81 + -t <min_thr>:<max_thr> specify thread limits per listening socket
  82 + across all processes
  83 + -u <user> change server's user id to specified value
  84 + -q <backlog> set max length of pending connections queue
  85 + -a enable access logging
  86 + -i run in interactive mode (useful for debugging)
  87 + -S serialize accept() calls from different processes
  88 + on the same listening socket (if needed).
  89 +
  90 +DESCRIPTION
  91 +
  92 + This program is a general server example. It accepts a client connection
  93 + and outputs a short HTML page. It can be easily adapted to provide
  94 + other services.
  95 +
  96 +
  97 +---------------------------------------------------------------------------
  98 +
  1 +/*
  2 + * Portions created by SGI are Copyright (C) 2000 Silicon Graphics, Inc.
  3 + * All Rights Reserved.
  4 + *
  5 + * Redistribution and use in source and binary forms, with or without
  6 + * modification, are permitted provided that the following conditions
  7 + * are met:
  8 + *
  9 + * 1. Redistributions of source code must retain the above copyright
  10 + * notice, this list of conditions and the following disclaimer.
  11 + * 2. Redistributions in binary form must reproduce the above copyright
  12 + * notice, this list of conditions and the following disclaimer in the
  13 + * documentation and/or other materials provided with the distribution.
  14 + * 3. Neither the name of Silicon Graphics, Inc. nor the names of its
  15 + * contributors may be used to endorse or promote products derived from
  16 + * this software without specific prior written permission.
  17 + *
  18 + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  19 + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  20 + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  21 + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  22 + * HOLDERS AND CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  23 + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
  24 + * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  25 + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  26 + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  27 + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  28 + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  29 + */
  30 +
  31 +#include <stdarg.h>
  32 +#include <stdio.h>
  33 +#include <stdlib.h>
  34 +#include <string.h>
  35 +#include <unistd.h>
  36 +#include <errno.h>
  37 +#include "st.h"
  38 +
  39 +/*
  40 + * Simple error reporting functions.
  41 + * Suggested in W. Richard Stevens' "Advanced Programming in the UNIX
  42 + * Environment".
  43 + */
  44 +
  45 +#define MAXLINE 4096 /* max line length */
  46 +
  47 +static void err_doit(int, int, const char *, va_list);
  48 +
  49 +
/*
 * Report a nonfatal error related to a system call.
 * Writes the formatted message, followed by the description of the current
 * errno, to descriptor fd and returns to the caller.
 */
void err_sys_report(int fd, const char *fmt, ...)
{
    const int with_errno = 1;
    va_list args;

    va_start(args, fmt);
    err_doit(fd, with_errno, fmt, args);
    va_end(args);
}
  62 +
  63 +
/*
 * Report a fatal error related to a system call.
 * Writes the formatted message, followed by the description of the current
 * errno, to descriptor fd and terminates the process with exit status 1.
 */
void err_sys_quit(int fd, const char *fmt, ...)
{
    const int with_errno = 1;
    va_list args;

    va_start(args, fmt);
    err_doit(fd, with_errno, fmt, args);
    va_end(args);

    exit(1);
}
  77 +
  78 +
/*
 * Report a fatal error related to a system call and dump core.
 * Writes the formatted message, followed by the description of the current
 * errno, to descriptor fd and then calls abort().
 */
void err_sys_dump(int fd, const char *fmt, ...)
{
    const int with_errno = 1;
    va_list args;

    va_start(args, fmt);
    err_doit(fd, with_errno, fmt, args);
    va_end(args);

    abort();    /* dump core and terminate */
    exit(1);    /* shouldn't get here */
}
  93 +
  94 +
/*
 * Report a nonfatal error unrelated to a system call.
 * Writes the formatted message to descriptor fd, without any errno
 * description, and returns to the caller.
 */
void err_report(int fd, const char *fmt, ...)
{
    const int with_errno = 0;
    va_list args;

    va_start(args, fmt);
    err_doit(fd, with_errno, fmt, args);
    va_end(args);
}
  107 +
  108 +
/*
 * Report a fatal error unrelated to a system call.
 * Writes the formatted message to descriptor fd, without any errno
 * description, and terminates the process with exit status 1.
 */
void err_quit(int fd, const char *fmt, ...)
{
    const int with_errno = 0;
    va_list args;

    va_start(args, fmt);
    err_doit(fd, with_errno, fmt, args);
    va_end(args);

    exit(1);
}
  122 +
  123 +
/*
 * Return a pointer to a string containing the current time, formatted as
 * "[DD/Mon/YYYY:HH:MM:SS] " (note the trailing space).
 *
 * The string lives in a static buffer that is overwritten by later calls;
 * the caller must not free it. The formatted text is cached and reused as
 * long as st_time() keeps returning the same value.
 *
 * If the time cannot be converted to local time an empty string is
 * returned, and the conversion is attempted again on the next call.
 */
char *err_tstamp(void)
{
    static char *months[] = { "Jan", "Feb", "Mar", "Apr", "May", "Jun",
                              "Jul", "Aug", "Sep", "Oct", "Nov", "Dec" };
    static char str[32];
    static time_t lastt = 0;
    struct tm *tmp;
    time_t currt = st_time();

    /* Same second as last time: the cached text is still valid */
    if (currt == lastt)
        return str;

    tmp = localtime(&currt);
    if (tmp == NULL) {
        /* localtime() can fail; do not dereference NULL, and do not cache */
        str[0] = '\0';
        return str;
    }

    /* Bounded formatting: the output can never run past the end of str */
    snprintf(str, sizeof(str), "[%02d/%s/%d:%02d:%02d:%02d] ", tmp->tm_mday,
             months[tmp->tm_mon], 1900 + tmp->tm_year, tmp->tm_hour,
             tmp->tm_min, tmp->tm_sec);
    lastt = currt;

    return str;
}
  147 +
  148 +
  149 +/*
  150 + * Print a message and return to caller.
  151 + * Caller specifies "errnoflag".
  152 + */
  153 +static void err_doit(int fd, int errnoflag, const char *fmt, va_list ap)
  154 +{
  155 + int errno_save;
  156 + char buf[MAXLINE];
  157 +
  158 + errno_save = errno; /* value caller might want printed */
  159 + strcpy(buf, err_tstamp()); /* prepend a message with time stamp */
  160 + vsprintf(buf + strlen(buf), fmt, ap);
  161 + if (errnoflag)
  162 + sprintf(buf + strlen(buf), ": %s\n", strerror(errno_save));
  163 + else
  164 + strcat(buf, "\n");
  165 + write(fd, buf, strlen(buf));
  166 + errno = errno_save;
  167 +}
  168 +
  1 +/*
  2 + * Portions created by SGI are Copyright (C) 2000 Silicon Graphics, Inc.
  3 + * All Rights Reserved.
  4 + *
  5 + * Redistribution and use in source and binary forms, with or without
  6 + * modification, are permitted provided that the following conditions
  7 + * are met:
  8 + *
  9 + * 1. Redistributions of source code must retain the above copyright
  10 + * notice, this list of conditions and the following disclaimer.
  11 + * 2. Redistributions in binary form must reproduce the above copyright
  12 + * notice, this list of conditions and the following disclaimer in the
  13 + * documentation and/or other materials provided with the distribution.
  14 + * 3. Neither the name of Silicon Graphics, Inc. nor the names of its
  15 + * contributors may be used to endorse or promote products derived from
  16 + * this software without specific prior written permission.
  17 + *
  18 + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  19 + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  20 + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  21 + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  22 + * HOLDERS AND CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  23 + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
  24 + * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  25 + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  26 + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  27 + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  28 + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  29 + */
  30 +
  31 +#include <stdio.h>
  32 +#include <stdlib.h>
  33 +#include <unistd.h>
  34 +#include <sys/time.h>
  35 +#include <sys/types.h>
  36 +#include <sys/socket.h>
  37 +#include <netinet/in.h>
  38 +#include <arpa/inet.h>
  39 +#include <netdb.h>
  40 +#include "st.h"
  41 +
  42 +#if !defined(NETDB_INTERNAL) && defined(h_NETDB_INTERNAL)
  43 +#define NETDB_INTERNAL h_NETDB_INTERNAL
  44 +#endif
  45 +
  46 +/* Resolution timeout (in microseconds) */
  47 +#define TIMEOUT (2*1000000LL)
  48 +
  49 +/* External function defined in the res.c file */
  50 +int dns_getaddr(const char *host, struct in_addr *addr, st_utime_t timeout);
  51 +
  52 +
  53 +void *do_resolve(void *host)
  54 +{
  55 + struct in_addr addr;
  56 +
  57 + /* Use dns_getaddr() instead of gethostbyname(3) to get IP address */
  58 + if (dns_getaddr(host, &addr, TIMEOUT) < 0) {
  59 + fprintf(stderr, "dns_getaddr: can't resolve %s: ", (char *)host);
  60 + if (h_errno == NETDB_INTERNAL)
  61 + perror("");
  62 + else
  63 + herror("");
  64 + } else
  65 + printf("%-40s %s\n", (char *)host, inet_ntoa(addr));
  66 +
  67 + return NULL;
  68 +}
  69 +
  70 +
  71 +/*
  72 + * Asynchronous DNS host name resolution. This program creates one
  73 + * ST thread for each host name (specified as command line arguments).
  74 + * All threads do host name resolution concurrently.
  75 + */
  76 +int main(int argc, char *argv[])
  77 +{
  78 + int i;
  79 +
  80 + if (argc < 2) {
  81 + fprintf(stderr, "Usage: %s <hostname1> [<hostname2>] ...\n", argv[0]);
  82 + exit(1);
  83 + }
  84 +
  85 + if (st_init() < 0) {
  86 + perror("st_init");
  87 + exit(1);
  88 + }
  89 +
  90 + for (i = 1; i < argc; i++) {
  91 + /* Create a separate thread for each host name */
  92 + if (st_thread_create(do_resolve, argv[i], 0, 0) == NULL) {
  93 + perror("st_thread_create");
  94 + exit(1);
  95 + }
  96 + }
  97 +
  98 + st_thread_exit(NULL);
  99 +
  100 + /* NOTREACHED */
  101 + return 1;
  102 +}
  103 +
  1 +/*
  2 + * Portions created by SGI are Copyright (C) 2000 Silicon Graphics, Inc.
  3 + * All Rights Reserved.
  4 + *
  5 + * Redistribution and use in source and binary forms, with or without
  6 + * modification, are permitted provided that the following conditions
  7 + * are met:
  8 + *
  9 + * 1. Redistributions of source code must retain the above copyright
  10 + * notice, this list of conditions and the following disclaimer.
  11 + * 2. Redistributions in binary form must reproduce the above copyright
  12 + * notice, this list of conditions and the following disclaimer in the
  13 + * documentation and/or other materials provided with the distribution.
  14 + * 3. Neither the name of Silicon Graphics, Inc. nor the names of its
  15 + * contributors may be used to endorse or promote products derived from
  16 + * this software without specific prior written permission.
  17 + *
  18 + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  19 + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  20 + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  21 + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  22 + * HOLDERS AND CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  23 + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
  24 + * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  25 + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  26 + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  27 + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  28 + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  29 + */
  30 +
  31 +#include <stdio.h>
  32 +#include <stdlib.h>
  33 +#include <string.h>
  34 +#include <signal.h>
  35 +#include <unistd.h>
  36 +#include <fcntl.h>
  37 +#include <sys/types.h>
  38 +#include <sys/stat.h>
  39 +#include <sys/socket.h>
  40 +#include <netinet/in.h>
  41 +#include <arpa/inet.h>
  42 +#include <netdb.h>
  43 +#include "st.h"
  44 +
  45 +#define IOBUFSIZE (16*1024)
  46 +
  47 +#define IOV_LEN 256
  48 +#define IOV_COUNT (IOBUFSIZE / IOV_LEN)
  49 +
  50 +#ifndef INADDR_NONE
  51 +#define INADDR_NONE 0xffffffff
  52 +#endif
  53 +
  54 +static char *prog; /* Program name */
  55 +static struct sockaddr_in rmt_addr; /* Remote address */
  56 +
  57 +static unsigned long testing;
  58 +#define TESTING_VERBOSE 0x1
  59 +#define TESTING_READV 0x2
  60 +#define TESTING_READ_RESID 0x4
  61 +#define TESTING_WRITEV 0x8
  62 +#define TESTING_WRITE_RESID 0x10
  63 +
  64 +static void read_address(const char *str, struct sockaddr_in *sin);
  65 +static void start_daemon(void);
  66 +static int cpu_count(void);
  67 +static void set_concurrency(int nproc);
  68 +static void *handle_request(void *arg);
  69 +static void print_sys_error(const char *msg);
  70 +
  71 +
  72 +/*
  73 + * This program acts as a generic gateway. It listens for connections
  74 + * to a local address ('-l' option). Upon accepting a client connection,
  75 + * it connects to the specified remote address ('-r' option) and then
  76 + * just pumps the data through without any modification.
  77 + */
  78 +int main(int argc, char *argv[])
  79 +{
  80 + extern char *optarg;
  81 + int opt, sock, n;
  82 + int laddr, raddr, num_procs, alt_ev, one_process;
  83 + int serialize_accept = 0;
  84 + struct sockaddr_in lcl_addr, cli_addr;
  85 + st_netfd_t cli_nfd, srv_nfd;
  86 +
  87 + prog = argv[0];
  88 + num_procs = laddr = raddr = alt_ev = one_process = 0;
  89 +
  90 + /* Parse arguments */
  91 + while((opt = getopt(argc, argv, "l:r:p:Saht:X")) != EOF) {
  92 + switch (opt) {
  93 + case 'a':
  94 + alt_ev = 1;
  95 + break;
  96 + case 'l':
  97 + read_address(optarg, &lcl_addr);
  98 + laddr = 1;
  99 + break;
  100 + case 'r':
  101 + read_address(optarg, &rmt_addr);
  102 + if (rmt_addr.sin_addr.s_addr == INADDR_ANY) {
  103 + fprintf(stderr, "%s: invalid remote address: %s\n", prog, optarg);
  104 + exit(1);
  105 + }
  106 + raddr = 1;
  107 + break;
  108 + case 'p':
  109 + num_procs = atoi(optarg);
  110 + if (num_procs < 1) {
  111 + fprintf(stderr, "%s: invalid number of processes: %s\n", prog, optarg);
  112 + exit(1);
  113 + }
  114 + break;
  115 + case 'S':
  116 + /*
  117 + * Serialization decision is tricky on some platforms. For example,
  118 + * Solaris 2.6 and above has kernel sockets implementation, so supposedly
  119 + * there is no need for serialization. The ST library may be compiled
  120 + * on one OS version, but used on another, so the need for serialization
  121 + * should be determined at run time by the application. Since it's just
  122 + * an example, the serialization decision is left up to user.
  123 + * Only on platforms where the serialization is never needed on any OS
  124 + * version st_netfd_serialize_accept() is a no-op.
  125 + */
  126 + serialize_accept = 1;
  127 + break;
  128 + case 't':
  129 + testing = strtoul(optarg, NULL, 0);
  130 + break;
  131 + case 'X':
  132 + one_process = 1;
  133 + break;
  134 + case 'h':
  135 + case '?':
  136 + fprintf(stderr, "Usage: %s [options] -l <[host]:port> -r <host:port>\n",
  137 + prog);
  138 + fprintf(stderr, "options are:\n");
  139 + fprintf(stderr, " -p <num_processes> number of parallel processes\n");
  140 + fprintf(stderr, " -S serialize accepts\n");
  141 + fprintf(stderr, " -a use alternate event system\n");
  142 +#ifdef DEBUG
  143 + fprintf(stderr, " -t mask testing/debugging mode\n");
  144 + fprintf(stderr, " -X one process, don't daemonize\n");
  145 +#endif
  146 + exit(1);
  147 + }
  148 + }
  149 + if (!laddr) {
  150 + fprintf(stderr, "%s: local address required\n", prog);
  151 + exit(1);
  152 + }
  153 + if (!raddr) {
  154 + fprintf(stderr, "%s: remote address required\n", prog);
  155 + exit(1);
  156 + }
  157 + if (num_procs == 0)
  158 + num_procs = cpu_count();
  159 +
  160 + fprintf(stderr, "%s: starting proxy daemon on %s:%d\n", prog,
  161 + inet_ntoa(lcl_addr.sin_addr), ntohs(lcl_addr.sin_port));
  162 +
  163 + /* Start the daemon */
  164 + if (one_process)
  165 + num_procs = 1;
  166 + else
  167 + start_daemon();
  168 +
  169 + if (alt_ev)
  170 + st_set_eventsys(ST_EVENTSYS_ALT);
  171 +
  172 + /* Initialize the ST library */
  173 + if (st_init() < 0) {
  174 + print_sys_error("st_init");
  175 + exit(1);
  176 + }
  177 +
  178 + /* Create and bind listening socket */
  179 + if ((sock = socket(PF_INET, SOCK_STREAM, 0)) < 0) {
  180 + print_sys_error("socket");
  181 + exit(1);
  182 + }
  183 + n = 1;
  184 + if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (char *)&n, sizeof(n)) < 0) {
  185 + print_sys_error("setsockopt");
  186 + exit(1);
  187 + }
  188 + if (bind(sock, (struct sockaddr *)&lcl_addr, sizeof(lcl_addr)) < 0) {
  189 + print_sys_error("bind");
  190 + exit(1);
  191 + }
  192 + listen(sock, 128);
  193 + if ((srv_nfd = st_netfd_open_socket(sock)) == NULL) {
  194 + print_sys_error("st_netfd_open");
  195 + exit(1);
  196 + }
  197 + /* See the comment regarding serialization decision above */
  198 + if (num_procs > 1 && serialize_accept && st_netfd_serialize_accept(srv_nfd)
  199 + < 0) {
  200 + print_sys_error("st_netfd_serialize_accept");
  201 + exit(1);
  202 + }
  203 +
  204 + /* Start server processes */
  205 + if (!one_process)
  206 + set_concurrency(num_procs);
  207 +
  208 + for ( ; ; ) {
  209 + n = sizeof(cli_addr);
  210 + cli_nfd = st_accept(srv_nfd, (struct sockaddr *)&cli_addr, &n,
  211 + ST_UTIME_NO_TIMEOUT);
  212 + if (cli_nfd == NULL) {
  213 + print_sys_error("st_accept");
  214 + exit(1);
  215 + }
  216 + if (st_thread_create(handle_request, cli_nfd, 0, 0) == NULL) {
  217 + print_sys_error("st_thread_create");
  218 + exit(1);
  219 + }
  220 + }
  221 +
  222 + /* NOTREACHED */
  223 + return 1;
  224 +}
  225 +
  226 +
  227 +static void read_address(const char *str, struct sockaddr_in *sin)
  228 +{
  229 + char host[128], *p;
  230 + struct hostent *hp;
  231 + unsigned short port;
  232 +
  233 + strcpy(host, str);
  234 + if ((p = strchr(host, ':')) == NULL) {
  235 + fprintf(stderr, "%s: invalid address: %s\n", prog, host);
  236 + exit(1);
  237 + }
  238 + *p++ = '\0';
  239 + port = (unsigned short) atoi(p);
  240 + if (port < 1) {
  241 + fprintf(stderr, "%s: invalid port: %s\n", prog, p);
  242 + exit(1);
  243 + }
  244 +
  245 + memset(sin, 0, sizeof(struct sockaddr_in));
  246 + sin->sin_family = AF_INET;
  247 + sin->sin_port = htons(port);
  248 + if (host[0] == '\0') {
  249 + sin->sin_addr.s_addr = INADDR_ANY;
  250 + return;
  251 + }
  252 + sin->sin_addr.s_addr = inet_addr(host);
  253 + if (sin->sin_addr.s_addr == INADDR_NONE) {
  254 + /* not dotted-decimal */
  255 + if ((hp = gethostbyname(host)) == NULL) {
  256 + fprintf(stderr, "%s: can't resolve address: %s\n", prog, host);
  257 + exit(1);
  258 + }
  259 + memcpy(&sin->sin_addr, hp->h_addr, hp->h_length);
  260 + }
  261 +}
  262 +
  263 +#ifdef DEBUG
  264 +static void show_iov(const struct iovec *iov, int niov)
  265 +{
  266 + int i;
  267 + size_t total;
  268 +
  269 + printf("iov %p has %d entries:\n", iov, niov);
  270 + total = 0;
  271 + for (i = 0; i < niov; i++) {
  272 + printf("iov[%3d] iov_base=%p iov_len=0x%lx(%lu)\n",
  273 + i, iov[i].iov_base, (unsigned long) iov[i].iov_len,
  274 + (unsigned long) iov[i].iov_len);
  275 + total += iov[i].iov_len;
  276 + }
  277 + printf("total 0x%lx(%ld)\n", (unsigned long) total, (unsigned long) total);
  278 +}
  279 +
  280 +/*
  281 + * This version is tricked out to test all the
  282 + * st_(read|write)v?(_resid)? variants. Use the non-DEBUG version for
  283 + * anything serious. st_(read|write) are all this function really
  284 + * needs.
  285 + */
  286 +static int pass(st_netfd_t in, st_netfd_t out)
  287 +{
  288 + char buf[IOBUFSIZE];
  289 + struct iovec iov[IOV_COUNT];
  290 + int ioviter, nw, nr;
  291 +
  292 + if (testing & TESTING_READV) {
  293 + for (ioviter = 0; ioviter < IOV_COUNT; ioviter++) {
  294 + iov[ioviter].iov_base = &buf[ioviter * IOV_LEN];
  295 + iov[ioviter].iov_len = IOV_LEN;
  296 + }
  297 + if (testing & TESTING_VERBOSE) {
  298 + printf("readv(%p)...\n", in);
  299 + show_iov(iov, IOV_COUNT);
  300 + }
  301 + if (testing & TESTING_READ_RESID) {
  302 + struct iovec *riov = iov;
  303 + int riov_cnt = IOV_COUNT;
  304 + if (st_readv_resid(in, &riov, &riov_cnt, ST_UTIME_NO_TIMEOUT) == 0) {
  305 + if (testing & TESTING_VERBOSE) {
  306 + printf("resid\n");
  307 + show_iov(riov, riov_cnt);
  308 + printf("full\n");
  309 + show_iov(iov, IOV_COUNT);
  310 + }
  311 + nr = 0;
  312 + for (ioviter = 0; ioviter < IOV_COUNT; ioviter++)
  313 + nr += iov[ioviter].iov_len;
  314 + nr = IOBUFSIZE - nr;
  315 + } else
  316 + nr = -1;
  317 + } else
  318 + nr = (int) st_readv(in, iov, IOV_COUNT, ST_UTIME_NO_TIMEOUT);
  319 + } else {
  320 + if (testing & TESTING_READ_RESID) {
  321 + size_t resid = IOBUFSIZE;
  322 + if (st_read_resid(in, buf, &resid, ST_UTIME_NO_TIMEOUT) == 0)
  323 + nr = IOBUFSIZE - resid;
  324 + else
  325 + nr = -1;
  326 + } else
  327 + nr = (int) st_read(in, buf, IOBUFSIZE, ST_UTIME_NO_TIMEOUT);
  328 + }
  329 + if (testing & TESTING_VERBOSE)
  330 + printf("got 0x%x(%d) E=%d\n", nr, nr, errno);
  331 +
  332 + if (nr <= 0)
  333 + return 0;
  334 +
  335 + if (testing & TESTING_WRITEV) {
  336 + for (nw = 0, ioviter = 0; nw < nr;
  337 + nw += iov[ioviter].iov_len, ioviter++) {
  338 + iov[ioviter].iov_base = &buf[nw];
  339 + iov[ioviter].iov_len = nr - nw;
  340 + if (iov[ioviter].iov_len > IOV_LEN)
  341 + iov[ioviter].iov_len = IOV_LEN;
  342 + }
  343 + if (testing & TESTING_VERBOSE) {
  344 + printf("writev(%p)...\n", out);
  345 + show_iov(iov, ioviter);
  346 + }
  347 + if (testing & TESTING_WRITE_RESID) {
  348 + struct iovec *riov = iov;
  349 + int riov_cnt = ioviter;
  350 + if (st_writev_resid(out, &riov, &riov_cnt, ST_UTIME_NO_TIMEOUT) == 0) {
  351 + if (testing & TESTING_VERBOSE) {
  352 + printf("resid\n");
  353 + show_iov(riov, riov_cnt);
  354 + printf("full\n");
  355 + show_iov(iov, ioviter);
  356 + }
  357 + nw = 0;
  358 + while (--ioviter >= 0)
  359 + nw += iov[ioviter].iov_len;
  360 + nw = nr - nw;
  361 + } else
  362 + nw = -1;
  363 + } else
  364 + nw = st_writev(out, iov, ioviter, ST_UTIME_NO_TIMEOUT);
  365 + } else {
  366 + if (testing & TESTING_WRITE_RESID) {
  367 + size_t resid = nr;
  368 + if (st_write_resid(out, buf, &resid, ST_UTIME_NO_TIMEOUT) == 0)
  369 + nw = nr - resid;
  370 + else
  371 + nw = -1;
  372 + } else
  373 + nw = st_write(out, buf, nr, ST_UTIME_NO_TIMEOUT);
  374 + }
  375 + if (testing & TESTING_VERBOSE)
  376 + printf("put 0x%x(%d) E=%d\n", nw, nw, errno);
  377 +
  378 + if (nw != nr)
  379 + return 0;
  380 +
  381 + return 1;
  382 +}
  383 +#else /* DEBUG */
  384 +/*
  385 + * This version is the simple one suitable for serious use.
  386 + */
  387 +static int pass(st_netfd_t in, st_netfd_t out)
  388 +{
  389 + char buf[IOBUFSIZE];
  390 + int nw, nr;
  391 +
  392 + nr = (int) st_read(in, buf, IOBUFSIZE, ST_UTIME_NO_TIMEOUT);
  393 + if (nr <= 0)
  394 + return 0;
  395 +
  396 + nw = st_write(out, buf, nr, ST_UTIME_NO_TIMEOUT);
  397 + if (nw != nr)
  398 + return 0;
  399 +
  400 + return 1;
  401 +}
  402 +#endif
  403 +
  404 +static void *handle_request(void *arg)
  405 +{
  406 + struct pollfd pds[2];
  407 + st_netfd_t cli_nfd, rmt_nfd;
  408 + int sock;
  409 +
  410 + cli_nfd = (st_netfd_t) arg;
  411 + pds[0].fd = st_netfd_fileno(cli_nfd);
  412 + pds[0].events = POLLIN;
  413 +
  414 + /* Connect to remote host */
  415 + if ((sock = socket(PF_INET, SOCK_STREAM, 0)) < 0) {
  416 + print_sys_error("socket");
  417 + goto done;
  418 + }
  419 + if ((rmt_nfd = st_netfd_open_socket(sock)) == NULL) {
  420 + print_sys_error("st_netfd_open_socket");
  421 + close(sock);
  422 + goto done;
  423 + }
  424 + if (st_connect(rmt_nfd, (struct sockaddr *)&rmt_addr,
  425 + sizeof(rmt_addr), ST_UTIME_NO_TIMEOUT) < 0) {
  426 + print_sys_error("st_connect");
  427 + st_netfd_close(rmt_nfd);
  428 + goto done;
  429 + }
  430 + pds[1].fd = sock;
  431 + pds[1].events = POLLIN;
  432 +
  433 + /*
  434 + * Now just pump the data through.
  435 + * XXX This should use one thread for each direction for true full-duplex.
  436 + */
  437 + for ( ; ; ) {
  438 + pds[0].revents = 0;
  439 + pds[1].revents = 0;
  440 +
  441 + if (st_poll(pds, 2, ST_UTIME_NO_TIMEOUT) <= 0) {
  442 + print_sys_error("st_poll");
  443 + break;
  444 + }
  445 +
  446 + if (pds[0].revents & POLLIN) {
  447 + if (!pass(cli_nfd, rmt_nfd))
  448 + break;
  449 + }
  450 +
  451 + if (pds[1].revents & POLLIN) {
  452 + if (!pass(rmt_nfd, cli_nfd))
  453 + break;
  454 + }
  455 + }
  456 + st_netfd_close(rmt_nfd);
  457 +
  458 +done:
  459 +
  460 + st_netfd_close(cli_nfd);
  461 +
  462 + return NULL;
  463 +}
  464 +
  465 +static void start_daemon(void)
  466 +{
  467 + pid_t pid;
  468 +
  469 + /* Start forking */
  470 + if ((pid = fork()) < 0) {
  471 + print_sys_error("fork");
  472 + exit(1);
  473 + }
  474 + if (pid > 0)
  475 + exit(0); /* parent */
  476 +
  477 + /* First child process */
  478 + setsid(); /* become session leader */
  479 +
  480 + if ((pid = fork()) < 0) {
  481 + print_sys_error("fork");
  482 + exit(1);
  483 + }
  484 + if (pid > 0) /* first child */
  485 + exit(0);
  486 +
  487 + chdir("/");
  488 + umask(022);
  489 +}
  490 +
  491 +/*
  492 + * Create separate processes ("virtual processors"). Since it's just an
  493 + * example, there is no watchdog - the parent just exits leaving children
  494 + * on their own.
  495 + */
  496 +static void set_concurrency(int nproc)
  497 +{
  498 + pid_t pid;
  499 + int i;
  500 +
  501 + if (nproc < 1)
  502 + nproc = 1;
  503 +
  504 + for (i = 0; i < nproc; i++) {
  505 + if ((pid = fork()) < 0) {
  506 + print_sys_error("fork");
  507 + exit(1);
  508 + }
  509 + /* Child returns */
  510 + if (pid == 0)
  511 + return;
  512 + }
  513 +
  514 + /* Parent just exits */
  515 + exit(0);
  516 +}
  517 +
  518 +static int cpu_count(void)
  519 +{
  520 + int n;
  521 +
  522 +#if defined (_SC_NPROCESSORS_ONLN)
  523 + n = (int) sysconf(_SC_NPROCESSORS_ONLN);
  524 +#elif defined (_SC_NPROC_ONLN)
  525 + n = (int) sysconf(_SC_NPROC_ONLN);
  526 +#elif defined (HPUX)
  527 +#include <sys/mpctl.h>
  528 + n = mpctl(MPC_GETNUMSPUS, 0, 0);
  529 +#else
  530 + n = -1;
  531 + errno = ENOSYS;
  532 +#endif
  533 +
  534 + return n;
  535 +}
  536 +
  537 +static void print_sys_error(const char *msg)
  538 +{
  539 + fprintf(stderr, "%s: %s: %s\n", prog, msg, strerror(errno));
  540 +}
  541 +