crash test - cope with more complexity around re-starting.

With faster starting spare kits we can end up waiting for a state
where there are no live or spare kits, but the spare kit has already
beaten us to the punch:

[testCrashKit] (+1124ms): Killing coolkit instances.| httpcrashtest.cpp:151
[killPid] (+1124ms): Killing 1650047| httpcrashtest.cpp:257
[killPid] (+1124ms): Killing 1650071| httpcrashtest.cpp:257
[waitForKitProcessCount] (+1124ms): Waiting for kit process count: Doc Kits: 0  Spare Kits: 0 | KitPidHelpers.cpp:70
[waitForKitProcessCount] (+1124ms): Current kit processes: Doc Kits: [1650047, ] Spare Kits: [1650071, ]| KitPidHelpers.cpp:80
...
Forking a coolkit process with jailId: 09X67pOy1HAgSk9G as spare coolkit #5.| kit/ForKit.cpp:406
...
[waitForKitProcessCount] (+1558ms): Current kit processes: Doc Kits: [] Spare Kits: [1650083, ]| KitPidHelpers.cpp:94
... fail ...

Avoid this by intersecting before & after to ensure are all before are
dead.

Cleanup exposing the more problematic wait-for-zero-spare method.

Signed-off-by: Michael Meeks <michael.meeks@collabora.com>
Change-Id: I6f0ba87139b58f9bf1770ebbc5cac95b5063679e
This commit is contained in:
Michael Meeks 2024-03-21 16:11:37 +00:00
parent 5355042687
commit 31f1ce360a
3 changed files with 65 additions and 48 deletions

View file

@ -13,6 +13,7 @@
#include <set>
#include <chrono>
#include <iostream>
#include <algorithm>
#include <thread>
#include <string>
@ -58,7 +59,8 @@ void helpers::logKitProcesses(const std::string& testname)
<< " Spare Kits: " << getPidList(spareKitPids));
}
void helpers::waitForKitProcessCount(
namespace {
void waitForKitProcessCount(
const std::string& testname,
int numDocKits,
int numSpareKits /* = -1 */,
@ -69,8 +71,8 @@ void helpers::waitForKitProcessCount(
<< (numDocKits >= 0 ? "Doc Kits: " + std::to_string(numDocKits) + " " : "")
<< (numSpareKits >= 0 ? " Spare Kits: " + std::to_string(numSpareKits) + " " : ""));
std::set<pid_t> docKitPids = getDocKitPids();
std::set<pid_t> spareKitPids = getSpareKitPids();
std::set<pid_t> docKitPids = helpers::getDocKitPids();
std::set<pid_t> spareKitPids = helpers::getSpareKitPids();
bool pass = (numDocKits < 0 || docKitPids.size() == static_cast<size_t>(numDocKits)) &&
(numSpareKits < 0 || spareKitPids.size() == static_cast<size_t>(numSpareKits));
int tries = (timeoutMs / retryMs);
@ -83,8 +85,8 @@ void helpers::waitForKitProcessCount(
{
std::this_thread::sleep_for(retryMs);
docKitPids = getDocKitPids();
spareKitPids = getSpareKitPids();
docKitPids = helpers::getDocKitPids();
spareKitPids = helpers::getSpareKitPids();
pass = (numDocKits < 0 || docKitPids.size() == static_cast<size_t>(numDocKits)) &&
(numSpareKits < 0 || spareKitPids.size() == static_cast<size_t>(numSpareKits));
tries--;
@ -111,6 +113,7 @@ void helpers::waitForKitProcessCount(
LOK_ASSERT_FAIL("Timed out waiting for kit process count: " + oss.str());
}
}
}
void helpers::waitForKitPidsReady(
const std::string& testname,
@ -120,10 +123,50 @@ void helpers::waitForKitPidsReady(
waitForKitProcessCount(testname, 0, 1, timeoutMs, retryMs);
}
void helpers::waitForKitPidsKilled(
const std::string& testname,
const std::chrono::milliseconds timeoutMs /* = KIT_PID_TIMEOUT_MS */,
const std::chrono::milliseconds retryMs /* = KIT_PID_RETRY_MS */)
void helpers::killPid(const std::string& testname, const pid_t pid)
{
waitForKitProcessCount(testname, 0, 0, timeoutMs, retryMs);
TST_LOG("Killing " << pid);
if (kill(pid, SIGKILL) == -1)
TST_LOG("kill(" << pid << ", SIGKILL) failed: " <<
Util::symbolicErrno(errno) << ": " << std::strerror(errno));
}
void helpers::killAllKitProcesses(
const std::string& testname,
const std::chrono::milliseconds timeoutMs /* = COMMAND_TIMEOUT_MS * 8 */,
const std::chrono::milliseconds retryMs /* = 10ms */)
{
auto before = getKitPids();
for (pid_t pid : before)
killPid(testname, pid);
TST_LOG("Waiting for these kit processes to close: " << getPidList(before));
bool pass = false;
int tries = (timeoutMs / retryMs);
std::set<pid_t> inters;
while (tries >= 0 && !pass)
{
std::this_thread::sleep_for(retryMs);
auto current = getKitPids();
inters.clear();
std::set_intersection(
current.begin(), current.end(), before.begin(), before.end(),
std::inserter(inters, inters.begin()));
pass = inters.empty();
tries--;
TST_LOG("Current kit processes: " << getPidList(current));
}
TST_LOG("Finished waiting for all previous kit processes to close"
<< " before: " << getPidList(before)
<< " current: " << getPidList(getKitPids())
<< " intersection: " << getPidList(inters));
LOK_ASSERT_MESSAGE_SILENT("Timed out waiting for these kit processes to close", pass);
}

View file

@ -51,12 +51,9 @@ pid_t getForKitPid();
void logKitProcesses(const std::string& testname);
/*
* Wait until the number of kit processes matches the desired count.
* Set numDocKits and numSpareKits to -1 to skip counting those processes
* SIGKILL relevant pid
*/
void waitForKitProcessCount(const std::string& testname, int numDocKits, int numSpareKits,
const std::chrono::milliseconds timeoutMs,
const std::chrono::milliseconds retryMs);
void killPid(const std::string& testname, const pid_t pid);
/*
* Wait until ready with 0 doc kits and 1 spare kit
@ -69,11 +66,10 @@ void waitForKitPidsReady(
const std::chrono::milliseconds retryMs = std::chrono::milliseconds(KIT_PID_RETRY_MS));
/*
* Wait until all doc and spare kits are closed
* Kill all and wait for previous processes to die
*/
void waitForKitPidsKilled(
const std::string& testname,
const std::chrono::milliseconds timeoutMs = std::chrono::milliseconds(KIT_PID_TIMEOUT_MS),
const std::chrono::milliseconds retryMs = std::chrono::milliseconds(KIT_PID_RETRY_MS));
void killAllKitProcesses(const std::string& testname,
const std::chrono::milliseconds timeoutMs = std::chrono::milliseconds(KIT_PID_TIMEOUT_MS),
const std::chrono::milliseconds retryMs = std::chrono::milliseconds(KIT_PID_RETRY_MS));
} // namespace helpers

View file

@ -64,9 +64,7 @@ class HTTPCrashTest : public CPPUNIT_NS::TestFixture
void testRecoverAfterKitCrash();
void testCrashForkit();
void killPid(const pid_t pid, const std::string& testname);
void killDocKitProcesses(const std::string& testname);
void killAllKitProcesses(const std::string& testname);
public:
HTTPCrashTest()
@ -114,8 +112,7 @@ void HTTPCrashTest::testBarren()
try
{
TST_LOG("Killing all kits");
killAllKitProcesses(testname);
waitForKitPidsKilled(testname);
helpers::killAllKitProcesses(testname);
// Do not wait for spare kit to start up here
@ -150,8 +147,7 @@ void HTTPCrashTest::testCrashKit()
std::this_thread::sleep_for(std::chrono::seconds(1));
TST_LOG("Killing coolkit instances.");
killAllKitProcesses(testname);
waitForKitPidsKilled(testname);
helpers::killAllKitProcesses(testname);
// TST_LOG("Reading the error code from the socket.");
//FIXME: implement in WebSocketSession.
@ -186,7 +182,6 @@ void HTTPCrashTest::testRecoverAfterKitCrash()
TST_LOG("Killing coolkit instances.");
killDocKitProcesses(testname);
waitForKitPidsReady(testname);
TST_LOG("Reconnect after kill.");
std::shared_ptr<http::WebSocketSession> socket2 = loadDocAndGetSession(
@ -222,7 +217,7 @@ void HTTPCrashTest::testCrashForkit()
std::this_thread::sleep_for(std::chrono::seconds(1));
TST_LOG("Killing forkit.");
killPid(getForKitPid(), testname);
helpers::killPid(testname, getForKitPid());
TST_LOG("Communicating after kill.");
sendTextFrame(socket, "status", testname);
@ -234,8 +229,7 @@ void HTTPCrashTest::testCrashForkit()
socket->waitForDisconnection(std::chrono::seconds(5)));
TST_LOG("Killing coolkit.");
killAllKitProcesses(testname);
waitForKitPidsKilled(testname);
helpers::killAllKitProcesses(testname);
// Forkit should restart
waitForKitPidsReady(testname);
@ -255,29 +249,13 @@ void HTTPCrashTest::testCrashForkit()
}
}
void HTTPCrashTest::killPid(const pid_t pid, const std::string& testname)
{
TST_LOG("Killing " << pid);
if (kill(pid, SIGKILL) == -1)
{
TST_LOG("kill(" << pid << ", SIGKILL) failed: " << Util::symbolicErrno(errno) << ": " << std::strerror(errno));
}
}
void HTTPCrashTest::killDocKitProcesses(const std::string& testname)
{
for (pid_t pid : getDocKitPids())
{
killPid(pid, testname);
}
}
void HTTPCrashTest::killAllKitProcesses(const std::string& testname)
{
for (pid_t pid : getKitPids())
{
killPid(pid, testname);
helpers::killPid(testname, pid);
}
waitForKitPidsReady(testname);
}
CPPUNIT_TEST_SUITE_REGISTRATION(HTTPCrashTest);