Skip to content

Commit

Permalink
Pause group after 5 consecutive lost WUs. #305, Ensure WU is saved to…
Browse files Browse the repository at this point in the history
… DB in case shutdown Windows kills the process. #290
  • Loading branch information
jcoffland committed Nov 13, 2024
1 parent c2cf3d7 commit 6802f20
Show file tree
Hide file tree
Showing 6 changed files with 40 additions and 18 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
Folding@home Client Changelog
=============================

## v8.4.7
- Pause group after 5 consecutive lost WUs. #305
- Ensure WU is saved to DB in case a Windows shutdown kills the process. #290

## v8.4.6
- Attempt more graceful shutdown in Windows.
- Increase clock skew detection threshold from 15s to 5m.
Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "fah-client",
"version": "8.4.6",
"version": "8.4.7",
"bin": {"fah-client": "./fah-client"},
"author": "Joseph Coffland <[email protected]>",
"homepage": "https://foldingathome.org/",
Expand Down
33 changes: 25 additions & 8 deletions src/fah/client/Group.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,9 @@ Group::Units Group::units() const {return Units(*app.getUnits(), name);}


// Applies the state message `msg` to this group's config.  When the call
// moves the config from paused to not paused, clearErrors() is invoked.
// triggerUpdate() is always called afterwards.
void Group::setState(const JSON::Value &msg) {
  const bool pausedBefore = config->getPaused();

  config->setState(msg);

  // Only query the new paused state if the group was paused beforehand.
  const bool resumed = pausedBefore && !config->getPaused();
  if (resumed) clearErrors();

  triggerUpdate();
}

Expand Down Expand Up @@ -115,14 +117,30 @@ void Group::shutdown(function<void ()> cb) {
}


void Group::unitComplete(bool success) {
if (success) {
failures = 0;
setWait(0);
// Resets this group's error tracking: zeroes the `lostWUs` and `failures`
// counters, calls setWait(0), and inserts 0 under "failed_wus" and
// "lost_wus" and an empty string under "failed".
void Group::clearErrors() {
  // Member counters
  lostWUs = failures = 0;

  setWait(0);

  // Inserted values mirroring the cleared counters
  insert("failed_wus", 0);
  insert("lost_wus", 0);
  insert("failed", "");
}


} else {
failures++;
void Group::unitComplete(bool success, bool downloaded) {
if (success) clearErrors();
else {
insert("failed_wus", ++failures);
setWait(pow(2, std::min(failures, 10U)));

if (downloaded) {
insert("lost_wus", ++lostWUs);

if (4 < lostWUs) {
insert("failed", "Paused due too many failed Work Units.");
config->setPaused(true);
}
}
}

triggerUpdate();
Expand Down Expand Up @@ -178,8 +196,7 @@ void Group::update() {
}

// No further action if paused or idle
if (config->getPaused() || waitForIdle())
return setWait(0); // Pausing clears wait timer
if (config->getPaused() || waitForIdle()) return;

// Wait on failures
auto now = Time::now();
Expand Down
6 changes: 3 additions & 3 deletions src/fah/client/Group.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ namespace FAH {
cb::SmartPointer<Config> config;

cb::Event::EventPtr event;
uint32_t lostWUs = 0;
uint32_t failures = 0;
uint64_t waitUntil = 0;

Expand Down Expand Up @@ -95,9 +96,7 @@ namespace FAH {
Group(App &app, const std::string &name);

const std::string &getName() const {return name;}

Config &getConfig() const {return *config;}

Units units() const;

void setState(const cb::JSON::Value &msg);
Expand All @@ -109,7 +108,8 @@ namespace FAH {
bool hasUnrunWUs() const;
void triggerUpdate();
void shutdown(std::function<void ()> cb);
void unitComplete(bool success);
void clearErrors();
void unitComplete(bool success, bool downloaded);

void save();
void remove();
Expand Down
4 changes: 2 additions & 2 deletions src/fah/client/Groups.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -115,12 +115,12 @@ void Groups::triggerUpdate() {
void Groups::setState(const JSON::Value &msg) {
if (msg.hasString("group")) {
LOG_INFO(1, "Group state " << msg.getString("state"));
getGroup(msg.getString("group")).getConfig().setState(msg);
getGroup(msg.getString("group")).setState(msg);

} else {
LOG_INFO(1, "Machine state " << msg.getString("state"));
for (auto &name: keys())
getGroup(name).getConfig().setState(msg);
getGroup(name).setState(msg);
}
}

Expand Down
9 changes: 5 additions & 4 deletions src/fah/client/Unit.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -404,7 +404,7 @@ void Unit::dumpWU() {


void Unit::save() {
if (getState() < UNIT_RUN || getState() == UNIT_DONE) return;
if (getState() < UNIT_CORE || getState() == UNIT_DONE) return;

JSON::BufferWriter writer;

Expand Down Expand Up @@ -448,7 +448,8 @@ void Unit::next() {
return monitorRun();
}

return finalizeRun();
finalizeRun();
return save();
}

// Handle pause
Expand Down Expand Up @@ -853,7 +854,7 @@ void Unit::clean(const string &result) {
TRY_CATCH_ERROR(app.getDB("units").unset(id));

setState(UNIT_DONE);
group->unitComplete(success);
group->unitComplete(result == "credited", UNIT_CORE < getState());
}


Expand Down Expand Up @@ -1080,7 +1081,7 @@ void Unit::downloadResponse(const JSON::ValuePtr &data) {

setState(UNIT_CORE);
this->data = data;
save(); // Not strictly necessary
save();
}


Expand Down

0 comments on commit 6802f20

Please sign in to comment.