Allen CI timeouts due to slow startup
Recently we noticed quite a few timeouts in the Allen CI. There is no obvious relation to any particular MR being merged.
I tried to understand what's wrong with job https://gitlab.cern.ch/lhcb/Allen/-/jobs/32591605, which ran on n4051701 (tag epyc7502).
The command line of the process that was "stuck" (using 100% of one CPU) is:
./Allen -t 64 --params external/ParamFiles/ --sequence hlt1_pp_veloSP.json --mdf /scratch/allen_data/mdf_input/upgrade_mc_minbias_scifi_v5_000_newLHCbID_new_UT_geometry.mdf -g /scratch/allen_geometries/geometry_dddb-20180815_sim-20180530-vc-md100_new_UT_geometry -n 100 -m 100 -r 100
Attaching GDB and breaking a few times always gives the following stack trace:
#8 Scheduler::calculate_lifetime_dependencies(std::vector<ConfiguredAlgorithmArguments, std::allocator<ConfiguredAlgorithmArguments> > const&, std::map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > >, std::less<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > > > > const&, std::vector<ConfiguredArgument, std::allocator<ConfiguredArgument> > const&, std::vector<Allen::TypeErasedAlgorithm, std::allocator<Allen::TypeErasedAlgorithm> > const&)::{lambda(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, auto:1 const&)#2}::operator()<std::map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > >, std::less<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > > > > >(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, 
std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > >, std::less<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > > > > const&) const (args=..., arg=...,
__closure=<synthetic pointer>) at ../stream/gear/include/Scheduler.cuh:52
#9 Scheduler::calculate_lifetime_dependencies (sequence_arguments=..., argument_dependencies=..., configured_arguments=..., sequence=..., this=0x5c37720)
at ../stream/gear/include/Scheduler.cuh:94
#10 0x00007f6632907364 in Scheduler::Scheduler (this=0x5c37720, configuration=..., param_do_print=<optimized out>, device_requested_mb=100, required_memory_alignment=64)
at ../stream/gear/include/Scheduler.cuh:157
#11 0x00007f66329015d6 in Stream::Stream (this=this@entry=0x685a930, configuration=..., param_do_print_memory_manager=param_do_print_memory_manager@entry=false,
reserve_mb=reserve_mb@entry=100, required_memory_alignment=64, param_constants=..., buffers_manager=0x1eeade0) at ../stream/sequence/src/Stream.cpp:28
#12 0x00007f6632343646 in allen (options=..., config=..., updater=updater@entry=0x7fff09a9c2e0, input_provider=..., output_handler=<optimized out>, zmqSvc=<optimized out>,
control_connection=...) at /cvmfs/lhcb.cern.ch/lib/lcg/releases/gcc/12.1.0-57c96/x86_64-centos9/include/c++/12.1.0/bits/unique_ptr.h:191
#13 0x00000000004040d6 in main (argc=17, argv=0x7fff09a9c468) at /cvmfs/lhcb.cern.ch/lib/lcg/releases/gcc/12.1.0-57c96/x86_64-centos9/include/c++/12.1.0/bits/stl_tree.h:211
It apparently takes quite a bit of time to run Scheduler::calculate_lifetime_dependencies,
and we do this once per stream (64 times).
@dcampora could you please check?
Edited by Rosen Matev