Skip to content

Commit 8f6ab9e

Browse files
authored
Fix provider status tracking bugs (akash-network#112)
* Add timeout to gRPC calls
* Move snapshot saving logic to providerStatusProvider
* Cap available quantities to 0 to prevent negative numbers
* Cleanup
* Rename function
* Use bulk create for GPUs & CPUs
* Fetch status of recently online providers first
1 parent 66f2870 commit 8f6ab9e

File tree

4 files changed

+300
-276
lines changed

4 files changed

+300
-276
lines changed
+146-82
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,15 @@
11
import https from "https";
22
import axios from "axios";
33
import semver from "semver";
4-
import { Provider } from "@shared/dbSchemas/akash";
4+
import { Provider, ProviderSnapshotNode, ProviderSnapshotNodeCPU, ProviderSnapshotNodeGPU } from "@shared/dbSchemas/akash";
55
import { asyncify, eachLimit } from "async";
66
import { ProviderSnapshot } from "@src/../../shared/dbSchemas/akash/providerSnapshot";
7-
import { fetchAndSaveProviderStats as grpcFetchAndSaveProviderStats } from "./statusEndpointHandlers/grpc";
8-
import { fetchAndSaveProviderStats as restFetchAndSaveProviderStats } from "./statusEndpointHandlers/rest";
7+
import { sequelize } from "@src/db/dbConnection";
8+
import { toUTC } from "@src/shared/utils/date";
9+
import { ProviderStatusInfo, ProviderVersionEndpointResponseType } from "./statusEndpointHandlers/types";
10+
import { isSameDay } from "date-fns";
11+
import { fetchProviderStatusFromGRPC } from "./statusEndpointHandlers/grpc";
12+
import { fetchProviderStatusFromREST } from "./statusEndpointHandlers/rest";
913

1014
const ConcurrentStatusCall = 10;
1115
const StatusCallTimeout = 10_000; // 10 seconds
@@ -16,7 +20,10 @@ export async function syncProvidersInfo() {
1620
deletedHeight: null
1721
},
1822
include: [{ model: ProviderSnapshot, as: "lastSnapshot" }],
19-
order: [["isOnline", "DESC"]]
23+
order: [
24+
["isOnline", "DESC"],
25+
["uptime30d", "DESC"]
26+
]
2027
});
2128

2229
const httpsAgent = new https.Agent({
@@ -28,103 +35,160 @@ export async function syncProvidersInfo() {
2835
providers,
2936
ConcurrentStatusCall,
3037
asyncify(async (provider: Provider) => {
38+
let providerStatus: ProviderStatusInfo | null = null;
39+
let errorMessage: string | null = null;
40+
let akashVersion: string | null = null;
41+
let cosmosVersion: string | null = null;
42+
3143
try {
3244
const versionResponse = await axios.get<ProviderVersionEndpointResponseType>(provider.hostUri + "/version", {
3345
httpsAgent: httpsAgent,
3446
timeout: StatusCallTimeout
3547
});
3648

37-
const akashVersion = semver.valid(versionResponse.data.akash.version);
38-
const cosmosVersion = semver.valid(
49+
akashVersion = semver.valid(versionResponse.data.akash.version);
50+
cosmosVersion = semver.valid(
3951
"cosmosSdkVersion" in versionResponse.data.akash ? versionResponse.data.akash.cosmosSdkVersion : versionResponse.data.akash.cosmos_sdk_version
4052
);
4153

4254
if (akashVersion && semver.gte(akashVersion, "0.5.0-0")) {
43-
await grpcFetchAndSaveProviderStats(provider, cosmosVersion, akashVersion, StatusCallTimeout);
55+
providerStatus = await fetchProviderStatusFromGRPC(provider, StatusCallTimeout);
4456
} else {
45-
await restFetchAndSaveProviderStats(provider, cosmosVersion, akashVersion, StatusCallTimeout);
57+
providerStatus = await fetchProviderStatusFromREST(provider, StatusCallTimeout);
4658
}
4759
} catch (err) {
48-
const checkDate = new Date();
49-
const errorMessage = err?.message?.toString() ?? err?.toString();
60+
errorMessage = err?.message?.toString() ?? err?.toString();
61+
}
5062

51-
await Provider.update(
52-
{
53-
isOnline: false,
54-
lastCheckDate: checkDate,
55-
error: errorMessage,
56-
akashVersion: null,
57-
cosmosSdkVersion: null,
58-
deploymentCount: null,
59-
leaseCount: null,
60-
activeCPU: null,
61-
activeGPU: null,
62-
activeMemory: null,
63-
activeStorage: null,
64-
pendingCPU: null,
65-
pendingGPU: null,
66-
pendingMemory: null,
67-
pendingStorage: null,
68-
availableCPU: null,
69-
availableGPU: null,
70-
availableMemory: null,
71-
availableStorage: null
72-
},
73-
{
74-
where: { owner: provider.owner }
75-
}
76-
);
63+
await saveProviderStatus(provider, providerStatus, akashVersion, cosmosVersion, errorMessage);
7764

78-
await ProviderSnapshot.create({
79-
owner: provider.owner,
80-
isOnline: false,
81-
error: errorMessage,
82-
checkDate: checkDate
83-
});
84-
} finally {
85-
doneCount++;
86-
console.log("Fetched provider info: " + doneCount + " / " + providers.length);
87-
}
65+
doneCount++;
66+
console.log("Fetched provider info: " + doneCount + " / " + providers.length);
8867
})
8968
);
9069

9170
console.log("Finished refreshing provider infos");
9271
}
9372

94-
type ProviderVersionEndpointResponseType =
95-
| {
96-
akash: { version: string; commit: string; buildTags: string; go: string; cosmosSdkVersion: string };
97-
kube: {
98-
major: string;
99-
minor: string;
100-
gitVersion: string;
101-
gitCommit: string;
102-
gitTreeState: string;
103-
buildDate: string;
104-
goVersion: string;
105-
compiler: string;
106-
platform: string;
107-
};
73+
/**
 * Persists the outcome of a single provider status check.
 *
 * Everything runs inside one database transaction:
 *  1. a new `ProviderSnapshot` row is created and flagged `isLastOfDay`;
 *  2. if the provider's previous snapshot falls on the same day as this check,
 *     that previous snapshot has `isLastOfDay` cleared;
 *  3. the `Provider` row is updated to point at the new snapshot and to mirror
 *     its status, versions and resource figures;
 *  4. when a status was obtained, one `ProviderSnapshotNode` row is created per
 *     node, followed by bulk inserts of that node's CPUs and GPUs.
 *
 * @param provider - Provider being checked; `provider.lastSnapshot` is read to
 *   decide whether the previous snapshot must lose its `isLastOfDay` flag.
 * @param providerStatus - Status returned by the provider, or `null` when the
 *   check failed (the provider is then recorded as offline).
 * @param akashVersion - Akash version reported by the provider, or `null`.
 * @param cosmosVersion - Cosmos SDK version reported by the provider, or `null`.
 * @param error - Error message of the failed check, or `null` on success.
 */
async function saveProviderStatus(
  provider: Provider,
  providerStatus: ProviderStatusInfo | null,
  akashVersion: string | null,
  cosmosVersion: string | null,
  error: string | null
): Promise<void> {
  await sequelize.transaction(async (t) => {
    const checkDate = toUTC(new Date());
    const isOnline = !!providerStatus;
    const resources = providerStatus?.resources;

    const createdSnapshot = await ProviderSnapshot.create(
      {
        owner: provider.owner,
        isOnline: isOnline,
        isLastOfDay: true,
        error: error,
        checkDate: checkDate,
        deploymentCount: resources?.deploymentCount,
        leaseCount: resources?.leaseCount,
        activeCPU: resources?.activeCPU,
        activeGPU: resources?.activeGPU,
        activeMemory: resources?.activeMemory,
        activeStorage: resources?.activeStorage,
        pendingCPU: resources?.pendingCPU,
        pendingGPU: resources?.pendingGPU,
        pendingMemory: resources?.pendingMemory,
        pendingStorage: resources?.pendingStorage,
        availableCPU: resources?.availableCPU,
        availableGPU: resources?.availableGPU,
        availableMemory: resources?.availableMemory,
        availableStorage: resources?.availableStorage
      },
      { transaction: t }
    );

    // Only the most recent snapshot of a day keeps `isLastOfDay`.
    // NOTE(review): date-fns `isSameDay` compares calendar days in the process's
    // local time zone while `checkDate` goes through `toUTC` — confirm both agree.
    if (provider.lastSnapshot && isSameDay(provider.lastSnapshot.checkDate, checkDate)) {
      await ProviderSnapshot.update(
        {
          isLastOfDay: false
        },
        {
          where: { id: provider.lastSnapshot.id },
          transaction: t
        }
      );
    }

    // `Model.update` ignores keys whose value is `undefined`, so every resource
    // field is coalesced to `null`. Without this, a provider whose check failed
    // would keep the resource figures of its last successful check.
    await Provider.update(
      {
        lastSnapshotId: createdSnapshot.id,
        isOnline: isOnline,
        error: error,
        lastCheckDate: checkDate,
        cosmosSdkVersion: cosmosVersion,
        akashVersion: akashVersion,
        deploymentCount: resources?.deploymentCount ?? null,
        leaseCount: resources?.leaseCount ?? null,
        activeCPU: resources?.activeCPU ?? null,
        activeGPU: resources?.activeGPU ?? null,
        activeMemory: resources?.activeMemory ?? null,
        activeStorage: resources?.activeStorage ?? null,
        pendingCPU: resources?.pendingCPU ?? null,
        pendingGPU: resources?.pendingGPU ?? null,
        pendingMemory: resources?.pendingMemory ?? null,
        pendingStorage: resources?.pendingStorage ?? null,
        availableCPU: resources?.availableCPU ?? null,
        availableGPU: resources?.availableGPU ?? null,
        availableMemory: resources?.availableMemory ?? null,
        availableStorage: resources?.availableStorage ?? null
      },
      {
        where: { owner: provider.owner },
        transaction: t
      }
    );

    if (!providerStatus) {
      // Failed check: there is no node inventory to record.
      return;
    }

    for (const node of providerStatus.nodes) {
      // The node row is created first because its id is the foreign key of the
      // CPU and GPU rows inserted below.
      const providerSnapshotNode = await ProviderSnapshotNode.create(
        {
          snapshotId: createdSnapshot.id,
          name: node.name,
          cpuAllocatable: node.cpuAllocatable,
          cpuAllocated: node.cpuAllocated,
          memoryAllocatable: node.memoryAllocatable,
          memoryAllocated: node.memoryAllocated,
          ephemeralStorageAllocatable: node.ephemeralStorageAllocatable,
          ephemeralStorageAllocated: node.ephemeralStorageAllocated,
          capabilitiesStorageHDD: node.capabilitiesStorageHDD,
          capabilitiesStorageSSD: node.capabilitiesStorageSSD,
          capabilitiesStorageNVME: node.capabilitiesStorageNVME,
          gpuAllocatable: node.gpuAllocatable,
          gpuAllocated: node.gpuAllocated
        },
        { transaction: t }
      );

      await ProviderSnapshotNodeCPU.bulkCreate(
        node.cpus.map((cpuInfo) => ({
          snapshotNodeId: providerSnapshotNode.id,
          vendor: cpuInfo.vendor,
          model: cpuInfo.model,
          vcores: cpuInfo.vcores
        })),
        { transaction: t }
      );

      await ProviderSnapshotNodeGPU.bulkCreate(
        node.gpus.map((gpuInfo) => ({
          snapshotNodeId: providerSnapshotNode.id,
          vendor: gpuInfo.vendor,
          name: gpuInfo.name,
          modelId: gpuInfo.modelId,
          interface: gpuInfo.interface,
          memorySize: gpuInfo.memorySize
        })),
        { transaction: t }
      );
    }
  });
}

0 commit comments

Comments
 (0)