Provenance Best Practices

From ALPS
Jump to: navigation, search

During the ETH Provenance Challenge we identified some "best practices" for producing provenance-rich scientific work.

Minimal requirement:

  • use version control for sources and scripts
    • commit often
    • write descriptive, but concise commit messages
  • store the revision number/repository state
  • store input parameters (incl. random seeds) used to obtain the data
    • when using HDF5 format for data files it's good practice to save all input parameters in a /parameters group
  • create a directory per figure containing relevant scripts
  • store the numbers for the data in the plot in an accompanying text file
  • upload raw output
  • describe the post-processing procedure that turns raw data into plotted values


Additional features:

  • store build information
    • store branch, revision number, build time and node.
    • any data output should carry attributes from which this information can be recovered (e.g. headers of a text file, or attributes in HDF5)
  • store runtime settings
    • store command line arguments, runtime and node
  • link figures to evaluation scripts and data
    • if you get the PDF figure, can you go back to the version of code and parameters used in the simulation?

Compiling code with version control information

Compiling code with provenance from Git repository

This example shows how to add Git repository information such as branch and revision to your code. It is easily portable to CMake and Subversion.

Makefile:

# File that receives the provenance line gathered by the `buildheader` target.
BUILDHEADER=/tmp/buildheader.info
# Shell snippet, evaluated by the shell when the compile recipe runs, that
# turns the first line of the header file into a quoted C string literal.
BUILDSTAMP="\"`head -n 1 ${BUILDHEADER}`\""
FLAGS = -O3 -DBUILD_STAMP=${BUILDSTAMP}

# `buildheader` names an action rather than a file, so it must run every time
# even if a file called "buildheader" happens to exist.
.PHONY: buildheader

# Record build date, current branch and commit hash on a single line.
# The recipe fails (and stops the build) when git is not on the PATH.
buildheader:
	command -v git >/dev/null 2>&1 &&  echo "Build date" `date +'%y.%m.%d %H:%M:%S'` "NL"  "Branch: " `git rev-parse --abbrev-ref HEAD` "NL" "Hash: " `git rev-parse HEAD` "" > ${BUILDHEADER}

# Refresh the header first so the stamp matches the sources being compiled.
# Recipe lines must start with a TAB character, not spaces.
program: buildheader
	c++ ${FLAGS} -o program program.cpp

program.cpp:

#include<iostream>
 
// Entry point: prints a reminder followed by the BUILD_STAMP string that the
// compiler receives through -DBUILD_STAMP (see the Makefile).
int main()
{
    using std::cout;
    using std::endl;

    cout << "Save the macro BUILD_STAMP with your data." << endl
         << BUILD_STAMP << endl;
    return 0;
}

Note: This example may need to be compiled as C++11; it has not been verified with C++03.

Subversion revision number in CMake

CMakeLists.txt:

# Build identifier; stays empty when Subversion is not found.
set(MYPROJECT_VERSION_BUILD "")
find_package(Subversion)
if(Subversion_FOUND)
  # get the Subversion info (provides the MYPROJECT_WC_* variables used below)
  Subversion_WC_INFO("${PROJECT_SOURCE_DIR}" MYPROJECT)
  # (optional) extract the branch path from the full url.
  # The expansions are quoted so that an empty value, or one containing
  # spaces or semicolons, is still passed as exactly one argument.
  string(REPLACE "${MYPROJECT_WC_ROOT}" "" MYPROJECT_BRANCH "${MYPROJECT_WC_URL}")
  # combine revision number and branch path
  set(MYPROJECT_VERSION_BUILD "r${MYPROJECT_WC_REVISION} (${MYPROJECT_BRANCH})")
endif()

# configure a C++ header file. build revision will then be available as a macro.
configure_file(version.hpp.in "${CMAKE_BINARY_DIR}/version.hpp")

version.hpp.in:

// Template for version.hpp: CMake's configure_file() (see CMakeLists.txt)
// fills in the placeholder below with the build revision string.
#ifndef MYPROJECT_VERSION_HPP
#define MYPROJECT_VERSION_HPP
 
#cmakedefine MYPROJECT_VERSION_BUILD "@MYPROJECT_VERSION_BUILD@"
 
#endif

Collecting build information with a post-build script

We can instruct cmake to run a script at the end of a build:

CMakeLists.txt:

...
# Once executable_name has been built, run buildlog.py to generate its build log.
# Script arguments, in order: source dir, current binary dir, ALPS dir, target name.
# NOTE(review): PYTHON_INTERPRETER is not set in this snippet - presumably it is
# defined elsewhere in the project; confirm before use.
add_custom_command(TARGET executable_name POST_BUILD
    COMMAND "${PYTHON_INTERPRETER}" 
    ARGS "${CMAKE_SOURCE_DIR}/buildlog.py" "${CMAKE_SOURCE_DIR}" "${CMAKE_CURRENT_BINARY_DIR}" "${ALPS_DIR}" "executable_name"
    COMMENT "Generating build log for executable_name" VERBATIM)

The following is an example Python script that collects rather extensive information, including Subversion status, linked libraries and the ALPS configuration. The information is saved alongside the binary and can be copied to the data at run time.

buildlog.py

from sys import argv
from os.path import *
import platform
from datetime import datetime
from socket import gethostname
from subprocess import * 
import re
 
# Horizontal rule written after every section of the build log.
SEPARATOR = 80*'=' + '\n'

# Lines skipped when copying CMakeCache.txt: comments, INTERNAL cache entries
# and empty lines.  Raw strings keep '\s' a regex escape instead of an
# invalid string escape.
CMAKE_CACHE_DISCARD = re.compile(r"(^//|^#|.*:INTERNAL=|^\s*$)")
# Lines skipped when copying ALPSConfig.cmake: comments and empty lines.
ALPS_CONFIG_DISCARD = re.compile(r"(^#|^\s*$)")


def run_text(cmd, **kwargs):
    """Run *cmd* and return its standard output as text.

    ``universal_newlines=True`` makes ``check_output`` return ``str`` rather
    than ``bytes`` on Python 3, so the result can be concatenated with the
    section headers; on Python 2.7 the result is a ``str`` either way.

    Raises ``OSError`` if the program cannot be started and
    ``CalledProcessError`` if it exits with a non-zero status.
    """
    return check_output(cmd, universal_newlines=True, **kwargs)


def kept_lines(lines, discard):
    """Return the items of *lines* whose beginning *discard* does not match."""
    return [line for line in lines if not discard.match(line)]


def write_section(out, title, body):
    """Write a '+++ title +++' header, *body* and a separator line to *out*."""
    out.write('+++ ' + title + ' +++\n' + body)
    out.write(SEPARATOR)


def main(args):
    """Collect build provenance and write it to '<bin>.log'.

    *args* has the layout of ``sys.argv``:
    ``[script, srcdir, builddir, alpsdir, bin]``.

    Raises ``RuntimeError`` when arguments are missing.  Failures of the svn
    or ldd/otool commands, or a missing CMakeCache.txt / ALPSConfig.cmake,
    propagate as exceptions; only the module list is best-effort.
    """
    if len(args) < 5:
        raise RuntimeError('usage: '+args[0]+' srcdir builddir alpsdir bin')
    srcdir = abspath(str(args[1]))
    builddir = abspath(str(args[2]))
    alpsdir = abspath(str(args[3]))
    target = abspath(str(args[4]))

    # Gather all command output before the log file is opened, so a failing
    # command does not leave a truncated log behind.
    svninfo = run_text(['svn', 'info', srcdir])
    svnstat = run_text(['svn', 'st', '-v', srcdir])
    svndiff = run_text(['svn', 'di', srcdir])

    if platform.system() == 'Darwin':
        ldd = run_text(['otool', '-L', target])
    else:
        ldd = run_text(['ldd', target])

    # The module list is optional: modulecmd may be absent or may fail.
    try:
        mods = run_text(['modulecmd', 'sh', 'list'], stderr=STDOUT)
    except (OSError, CalledProcessError):
        mods = None

    # Write build info; the with-block closes the log even if a later step fails.
    with open(target + '.log', 'w') as log:
        log.write('Target ' + target + ' built ' + str(datetime.now())
                  + ' on ' + gethostname() + '.\n')
        log.write('source dir:  ' + srcdir + '\n')
        log.write('build dir:   ' + builddir + '\n')
        log.write(SEPARATOR)
        write_section(log, 'svn info', svninfo)
        write_section(log, 'svn stat', svnstat)
        if svndiff:
            write_section(log, 'svn diff', svndiff)
        # The header is only written when the module list was obtained.
        if mods is not None:
            write_section(log, 'module list', mods)
        write_section(log, 'dynamic libraries', ldd)
        with open(join(builddir, 'CMakeCache.txt'), 'r') as cache:
            write_section(log, 'CMakeCache.txt',
                          ''.join(kept_lines(cache, CMAKE_CACHE_DISCARD)))
        with open(join(alpsdir, 'ALPSConfig.cmake'), 'r') as config:
            write_section(log, 'ALPSConfig.cmake',
                          ''.join(kept_lines(config, ALPS_CONFIG_DISCARD)))


if __name__ == '__main__':
    main(argv)

Collecting run time information with a run script

Instead of calling the simulation executable directly we can run it via a wrapper script that produces a provenance-rich execution log. The following is an example script that can be used for running a binary taking zero or more command-line arguments just by prefixing the normal command line with the run script.

$ run.py simulation_executable [simulation arguments]

run.py

#! /usr/bin/env python
from sys import argv, stdout, exit
from os import getcwd
from os.path import *
from datetime import datetime
from socket import gethostname
from subprocess import * 
 
if len(argv) < 2:
    raise RuntimeError('usage: '+argv[0]+' bin [args]')
bin  = abspath(str(argv[1]))
args = argv[2:]
log  = stdout
 
# Copy build info to log (see "Collecting build information")
binfo = bin+'.log'
with open(binfo,'r') as bi:
    log.write(bi.read())
 
# Collect run info
cmd = [bin]+args
log.write('running: '+str(cmd)+'\n')
log.write('host:  '+gethostname()+'\n')
log.write('cwd:   '+getcwd()+'\n')
log.write('start: '+str(datetime.now())+'\n')
log.write(80*'='+'\n' )
log.flush()
 
# module list
try:
    mods = check_output(['modulecmd','sh','list'],stderr=STDOUT)
    log.write('+++ module list +++\n')
    log.write(mods)
    log.write(80*'='+'\n' )
except:
    pass
log.flush()
 
# run cmd, copying output to log file
rc = call(cmd,stdout=log,stderr=log)
log.write(80*'='+'\n' )
if rc:
    log.write('FAILED with return code '+str(rc)+'\n')
log.write('stop: '+str(datetime.now())+'\n')
 
exit(rc)