# Copyright (C) 1997-2004 The CDG Team <cdg@nats.informatik.uni-hamburg.de>
#
# This file is free software; as a special exception the author gives
# unlimited permission to copy and/or distribute it, with or without
# modifications, as long as this notice is preserved.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY, to the extent permitted by law; without even the
# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

## ----------------------------------------------------------------------------
## YadaMachine - bundled information about a yada machine
## \note Actually this class should be split into a YadaScheduler and a
## YadaMachine, the latter of which only contains the configurational part of
## the current implementation. As things are, we add this class to the
## YadaScheduler module \a and the YadaConfiguration module, a construction
## that might be a little weird.
## \ingroup YadaConfiguration
## \ingroup YadaScheduler
##
## \author Michael Daum
##
## $Id: YadaMachine.tcl,v 1.13 2004/10/15 17:24:37 micha Exp $
## ----------------------------------------------------------------------------
class YadaMachine {
  inherit YadaConfigItem

  # variables ----------------------------------------------------------------

  ## list of IP addresses or host names making up this machine.
  ## An entry wrapped in parentheses, i.e. "(host)", denotes a node that is
  ## disabled or was found unreachable; see the configbody of this variable.
  public variable nodes {127.0.0.1}

  ## overall sum of cpus in all nodes
  public variable cpus 1

  ## access manner for this machine.
  ## possible values are "local" or any way of getting a shell on the nodes, i.e.
  ## \code ssh -x %node \endcode
  ## Notice that the string \c %node is replaced with the actual node name in
  ## YadaMachine::nodes to which a specific connection is to be established.
  public variable access "local"

  ## list of all YadaJob s queued up to be scheduled on all nodes
  private variable _jobQueue ""

  ## list of YadaJob s currently running on the nodes
  private variable _runQueue ""

  ## statistics about the average duration of one YadaJob, in milliseconds
  ## (stopJob converts elapsed seconds by multiplying with 1000).
  ## The machine updates this figure the more jobs have been seen.
  private variable _timePerJob 60000

  ## number of jobs run on this machine.
  ## _addTime uses this as the sample count when folding a new duration into
  ## the running average _timePerJob.
  private variable _noComputedJobs 10

  ## used for profiling jobs on this machine.
  ## This hash maps YadaJob objects to the clock value (in seconds) taken
  ## when the job was removed from the job queue.
  private variable _timerOfJob; ## \type TclArray

  ## hash mapping machine node names to the number of YadaJob s running on them
  private variable _jobsOnNode; ## \type TclArray

  # methods ------------------------------------------------------------------
  public method getLoad {}
  public method getTimePerJob {}
  public method startJob {}
  public method stopJob {job}; ## \type YadaJob
  public method wait {}
  public method queueJob {job}; ## \type YadaJob
  public method toDOM {}
  public method fromDOM {rootNode}; ## \type domNode
  public method getPersistanceFileName {}; ## overriding virtual method

  constructor {args} {}; ## \type TclList
  destructor {}

  private method _getNextNode {}
  private method _addTime {time}; ## \type TclNumber
  private method _startJob {}
};


## ----------------------------------------------------------------------------
## constructor
## Presets the machine name to "localhost", applies the given option/value
## pairs and flags the object as modified.
## \param args option/value pairs handed on to configure
## ----------------------------------------------------------------------------
body YadaMachine::constructor {args} {
  # NOTE(review): name and setModified are not declared in this class;
  # presumably they are inherited from YadaConfigItem - confirm.
  set name "localhost"
  eval configure $args
  setModified
}
## ----------------------------------------------------------------------------
## Serializes Object into a DOM-XML-Node
## Every entry of nodes becomes one <node> child of <machine>; an entry of the
## form "(host)" is written with its bare name and isActive="false", all
## others with isActive="true".
## @return root DOM node representing the Machine-Object (root tag <yada>)
## ----------------------------------------------------------------------------
body YadaMachine::toDOM {} {
  set document [dom createDocument "yada"]
  set root [$document documentElement]

  # register the element-building commands used in the script below
  dom createNodeCmd elementNode machine
  dom createNodeCmd elementNode node

  $root appendFromScript {
    machine -name $name -numCPUs $cpus -access $access {
      foreach n $nodes {
        # a parenthesised entry marks an inactive node; store the bare name
        if { [regexp {\(([^\)]+)\)} $n match sub] } {
          node -name $sub -isActive "false"
        } else {
          node -name $n -isActive "true"
        }
      }
    }
  }

  return $root
}

## ----------------------------------------------------------------------------
## Initialize Object with DOM node (that has saved the object state)
## All attributes are read into local variables first and are only committed
## to the object once the whole document has been parsed, so a malformed
## document leaves the object untouched.
## NOTE(review): nodes is assigned directly here, which presumably does not
## run the configbody of nodes (and thus does not refresh _jobsOnNode) -
## confirm that callers re-configure -nodes where scheduling is needed.
## @param rootNode root DOM-Node (Tag <yada>)
## ----------------------------------------------------------------------------
body YadaMachine::fromDOM {rootNode} {
  set xpath "/yada/machine"

  # the result is used as a command below, so it has to be exactly one node
  set machineNodes [$rootNode selectNodes $xpath]
  if {[llength $machineNodes] != 1} {
    error "ERROR: expected exactly one element at $xpath but found [llength $machineNodes]"
  }
  set nodeMachine [lindex $machineNodes 0]

  set newName [$nodeMachine getAttribute name]

  # inactive nodes are kept in the list, wrapped in parentheses
  set newNodes {}
  foreach n [$rootNode selectNodes ${xpath}/node] {
    set activeState [$n getAttribute "isActive"]

    if {[string equal $activeState "true"]} {
      lappend newNodes [$n getAttribute "name"]
    } else {
      lappend newNodes "([$n getAttribute name])"
    }
  }

  set newAccess [$nodeMachine getAttribute access]
  set newCpus [$nodeMachine getAttribute numCPUs]

  # everything parsed successfully: commit
  set name $newName
  set nodes $newNodes
  set access $newAccess
  set cpus $newCpus
}


## ----------------------------------------------------------------------------
## waits for a cpu to become free.
## Calling wait will guarantee that no more
## jobs are running than cpus are in the cluster. Execution will be suspended
## waiting for the _runQueue to change.
## ----------------------------------------------------------------------------
body YadaMachine::wait {} {
  # re-check after every change of _runQueue; vwait enters the event loop
  while {[llength $_runQueue] >= $cpus} {
    vwait [scope _runQueue]
  }
}

## ----------------------------------------------------------------------------
## queueJob
## Appends the job to the job queue and tells the job which machine it
## belongs to.
## \param job the YadaJob to be queued
## ----------------------------------------------------------------------------
body YadaMachine::queueJob {job} {
  lappend _jobQueue $job
  $job configure -machine $this
}

## ----------------------------------------------------------------------------
## _getNextNode
## This returns the next best node on the cluster, that is the one with
## the least jobs running on it. If several nodes share the minimum, the
## first one in the order of [array names] is returned.
## Raises an error if no node is registered in _jobsOnNode.
## ----------------------------------------------------------------------------
body YadaMachine::_getNextNode {} {
  set candidates [array names _jobsOnNode]
  set maxNodes [llength $candidates]

  if {$maxNodes < 1} {
    error "ERROR: invalid number $maxNodes of nodes in machine $name"
  }

  # start with the first node instead of a magic sentinel value
  set minNode [lindex $candidates 0]
  set minJobs $_jobsOnNode($minNode)

  # calculate the minimal node in the cluster
  foreach node [lrange $candidates 1 end] {
    if {$_jobsOnNode($node) < $minJobs} {
      set minNode $node
      set minJobs $_jobsOnNode($node)
    }
  }

  return $minNode
}

## ----------------------------------------------------------------------------
## getLoad
## \returns the number of running jobs divided by the number of cpus as a
## floating point number, or 0.0 if the machine has no cpus.
## ----------------------------------------------------------------------------
body YadaMachine::getLoad {} {
  if {$cpus == 0} {
    return 0.0
  }

  # braced so that the value of cpus is never re-parsed as an expression
  return [expr {[llength $_runQueue] / double($cpus)}]
}
## ----------------------------------------------------------------------------
## startJob
## schedule jobs
## If jobs are queued, one idle callback to _startJob is registered for every
## currently free cpu and the event loop is given a chance to run them.
## \returns the number of callbacks registered. This may exceed the number of
## queued jobs; surplus callbacks return immediately in _startJob.
## ----------------------------------------------------------------------------
body YadaMachine::startJob {} {
  set n 0

  if {$_jobQueue != ""} {
    for {set i [expr {$cpus - [llength $_runQueue]}]} {$i > 0} {incr i -1} {
      after idle [code $this _startJob]
      incr n
    }
    update
  }

  return $n
}

## ----------------------------------------------------------------------------
## _startJob
## remove the next YadaJob from the queue and run it
## ----------------------------------------------------------------------------
body YadaMachine::_startJob {} {
  if {$_jobQueue == ""} {
    return
  }

  # remove the next job
  set job [lindex $_jobQueue 0]
  $job setState "dequeuing"
  set _jobQueue [lreplace $_jobQueue 0 0]
  set _timerOfJob($job) [clock seconds]
  lappend _runQueue $job

  # build a process for that job
  set experiment [$job cget -experiment]
  set runner [$job cget -runner]
  set cdgp [$experiment cget -command]
  set wordgraphName [$job cget -wordgraphName]
  set logFile [$experiment getLogFileName $wordgraphName]
  set xmlFile [$experiment getXmlFileName $wordgraphName]
  set printCommand [code $runner print]
  set consumerCommand [$job cget -consumerCommand]

  # check file existence: if the xml file is already there, the job is
  # stopped again without running anything
  if {[file exists $xmlFile]} {
    stopJob $job
    return
  }

  # mark the wordgraph to be active
  $runner markWordgraph $wordgraphName ;# unmarked in the consumer

  set process [CdgProcess ::#auto \
                 -printCommand $printCommand \
                 -cdgp $cdgp \
                 -logFile $logFile \
                 -xmlFile $xmlFile \
                 -consumerCommand $consumerCommand \
                 -job $job \
                 -sync 1 \
                 -initFile none ]

  $job configure -process [scope $process]

  # build the command with regard to the access method
  if {$access == "local"} {
    set command $cdgp
  } else {
    # NOTE(review): _getNextNode raises an error if no node is registered;
    # at this point the job already sits in _runQueue - confirm that this
    # error path is handled by the caller.
    set newNode [_getNextNode]
    $job configure -node $newNode
    incr _jobsOnNode($newNode)

    # literal replacement of every %node; unlike a regsub substitution spec
    # this does not give special meaning to & or backslashes in the node
    # name, and it leaves the access string as it is if %node is absent
    set command "[string map [list %node $newNode] $access] $cdgp"

    # show scheduling
    if {0} {
      foreach node [lsort [array names _jobsOnNode]] {
        if {$node == $newNode} {
          puts -nonewline "*$node $_jobsOnNode($node) "
        } else {
          puts -nonewline " $node $_jobsOnNode($node) "
        }
      }
      puts ""
    }
  }


  $job setState "running"
  $process start $command
  $experiment start $job

  # CAUTION: no more code here

}

## ----------------------------------------------------------------------------
## stopJob
## Takes the job off this machine: updates the duration statistics, removes
## the job from the job queue and the run queue, counts down the jobs on its
## node and finally deletes the job object.
## Does nothing if the job has already passed the state "stopping".
## \param job the YadaJob to be stopped
## ----------------------------------------------------------------------------
body YadaMachine::stopJob {job} {
  if {[$job hasPassedState "stopping"]} {
    return
  }
  $job setState "stopping"

  # the timer holds seconds, the statistics are kept in milliseconds
  if {[info exists _timerOfJob($job)]} {
    set duration [expr {([clock seconds] - $_timerOfJob($job)) * 1000}]
    _addTime $duration
    unset _timerOfJob($job)
  }

  # remove from job queue
  set index [lsearch -exact $_jobQueue $job]
  if {$index >= 0} {
    set _jobQueue [lreplace $_jobQueue $index $index]
  }

  # remove from run queue
  set index [lsearch -exact $_runQueue $job]
  if {$index >= 0} {
    set _runQueue [lreplace $_runQueue $index $index]
  }

  # count down jobs per node.
  # Only touch nodes that are still registered: since Tcl 8.5 incr creates a
  # missing variable, which would silently re-register a node that has been
  # removed or disabled while the job was running.
  set node [$job cget -node]
  if {$node != "" && [info exists _jobsOnNode($node)]} {
    incr _jobsOnNode($node) -1
    if {$_jobsOnNode($node) < 0} {
      set _jobsOnNode($node) 0
      #puts "ERROR: negative job count on node $node"
    }
  }

  itcl::delete object $job
}

## ----------------------------------------------------------------------------
## destructor
## Stops every job that is still queued or running on this machine.
## ----------------------------------------------------------------------------
body YadaMachine::destructor {} {
  # concat yields a proper list copy; stopJob modifies both queues
  foreach job [concat $_jobQueue $_runQueue] {
    stopJob $job
  }
}

## ----------------------------------------------------------------------------
## update machine statistics.
## This machine adds the given time to the internal timer statistics.
## Incremental averaging is computed like this:
##   - \f$ x_1 \f$
##   - \f$ x_1 * 1/2 + x_2/2 = (x_1 + x_2)/2 \f$
##   - \f$ (x_1 * 1/2 + x_2/2) * 2/3 + x_3/3 = (x_1 + x_2 + x_3)/3 \f$
##   - \f$ ((x_1 * 1/2 + x_2/2) * 2/3 + x_3/3) * 3/4 + x_4/4 = (x_1 + x_2 + x_3 + x_4)/4 \f$
##   - ...
## \param time a temporal span
## \returns the computed average time per job as calculated for this machine.
## ----------------------------------------------------------------------------
body YadaMachine::_addTime {time} {
  # non-positive durations are ignored and leave the statistics untouched
  if {$time <= 0} {
    return $_timePerJob
  }

  incr _noComputedJobs

  # running average: old mean weighted by (n-1)/n plus the new sample by 1/n.
  # The expression is braced so that operands are used as numbers instead of
  # being substituted into the expression text and parsed a second time.
  set _timePerJob [expr {
    $_timePerJob * (($_noComputedJobs - 1.0) / ($_noComputedJobs + 0.0))
    + $time / ($_noComputedJobs + 0.0)
  }]

  return $_timePerJob
}

## ----------------------------------------------------------------------------
## getTimePerJob
## \returns the current average time per job of this machine
## ----------------------------------------------------------------------------
body YadaMachine::getTimePerJob {} {
  return $_timePerJob
}

## ----------------------------------------------------------------------------
## callback whenever the -nodes attribute is set
## This registers known nodes of a cluster by setting the number of
## running jobs in _jobsOnNode to 0. Formerly known nodes that aren't found
## in $nodes any more are removed from that array.
## ----------------------------------------------------------------------------
configbody YadaMachine::nodes {

  # delete removed nodes; node names are compared literally, not as patterns
  foreach node [array names _jobsOnNode] {
    if {[lsearch -exact $nodes $node] < 0} {
      unset _jobsOnNode($node)
    }
  }


  # add new nodes; the list is rebuilt in sorted order
  set currentNodes [lsort $nodes]
  set nodes ""
  foreach node $currentNodes {
    # exclude disabled nodes, i.e. entries written in parentheses
    if {[regexp {\(.*\)} $node]} {
      catch {unset _jobsOnNode($node)}
      lappend nodes $node
      continue
    }

    # test reachability.
    # The script is braced so that the node name is passed to fping as one
    # literal argument instead of being re-parsed as part of a Tcl script.
    # Any failure of exec counts as unreachable and the node is stored in
    # its disabled form; that includes fping not being available.
    if {[catch {exec fping -t100 -q $node >/dev/null}]} {
      catch {unset _jobsOnNode($node)}
      lappend nodes "($node)"
      continue
    }

    lappend nodes $node

    # keep the job count of nodes that are already known
    if {![info exists _jobsOnNode($node)]} {
      set _jobsOnNode($node) 0
    }
  }
}

## ----------------------------------------------------------------------------
## Getting unique file name from registry
## \returns whatever .main reports as persistence file name for the name of
## this machine
## ----------------------------------------------------------------------------
body YadaMachine::getPersistanceFileName {} {
  return [.main getPersistanceFileNameForMachine $name]
}