Skip to content

Commit dc93f2d

Browse files
committed
Refactor IDE spawn logic
- Instead of respawning when there is no join link, respawn when the PID is dead. - Give more time for the PID to become alive before trying to respawn. - More logging.
1 parent 502e33e commit dc93f2d

File tree

3 files changed

+74
-41
lines changed

3 files changed

+74
-41
lines changed

CHANGELOG.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,18 @@
44

55
## Unreleased
66

7+
### Changed
8+
9+
- Previously, the plugin would try to respawn the IDE if it failed to get a join
10+
link after five seconds. However, it seems sometimes we do not get a join link
11+
that quickly. Now the plugin will wait indefinitely for a join link as long as
12+
the process is still alive. If the process never comes alive after 30 seconds
13+
or it dies after coming alive, the plugin will attempt to respawn the IDE.
14+
15+
### Added
16+
17+
- Extra logging around the IDE spawn to help debugging.
18+
719
## 2.13.0 - 2024-07-16
820

921
### Added

gradle.properties

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ pluginGroup=com.coder.gateway
44
# Zip file name.
55
pluginName=coder-gateway
66
# SemVer format -> https://semver.org
7-
pluginVersion=2.13.0
7+
pluginVersion=2.13.1
88
# See https://plugins.jetbrains.com/docs/intellij/build-number-ranges.html
99
# for insight into build numbers and IntelliJ Platform versions.
1010
pluginSinceBuild=233.6745

src/main/kotlin/com/coder/gateway/CoderRemoteConnectionHandle.kt

Lines changed: 61 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ import java.net.URI
4848
import java.time.Duration
4949
import java.time.LocalDateTime
5050
import java.time.format.DateTimeFormatter
51+
import java.util.concurrent.TimeUnit
5152
import java.util.concurrent.TimeoutException
5253
import kotlin.coroutines.resume
5354
import kotlin.coroutines.resumeWithException
@@ -227,16 +228,9 @@ class CoderRemoteConnectionHandle {
227228

228229
// Wait for the IDE to come up.
229230
indicator.text = "Waiting for ${workspace.ideName} backend..."
230-
var status: UnattendedHostStatus? = null
231231
val remoteProjectPath = accessor.makeRemotePath(ShellArgument.PlainText(workspace.projectPath))
232232
val logsDir = accessor.getLogsDir(workspace.ideProduct.productCode, remoteProjectPath)
233-
while (lifetime.status == LifetimeStatus.Alive) {
234-
status = ensureIDEBackend(accessor, workspace, ideDir, remoteProjectPath, logsDir, lifetime, null)
235-
if (!status?.joinLink.isNullOrBlank()) {
236-
break
237-
}
238-
delay(5000)
239-
}
233+
var status = ensureIDEBackend(accessor, workspace, ideDir, remoteProjectPath, logsDir, lifetime, null)
240234

241235
// We wait for non-null, so this only happens on cancellation.
242236
val joinLink = status?.joinLink
@@ -302,6 +296,7 @@ class CoderRemoteConnectionHandle {
302296
}
303297
// Continue once the client is present.
304298
handle.onClientPresenceChanged.advise(lifetime) {
299+
logger.info("${workspace.ideName} client to ${workspace.hostname} presence: ${handle.clientPresent}")
305300
if (handle.clientPresent && continuation.isActive) {
306301
continuation.resume(true)
307302
}
@@ -437,8 +432,8 @@ class CoderRemoteConnectionHandle {
437432
}
438433

439434
/**
440-
* Ensure the backend is started. Status and/or links may be null if the
441-
* backend has not started.
435+
* Ensure the backend is started. It will not return until a join link is
436+
* received or the lifetime expires.
442437
*/
443438
private suspend fun ensureIDEBackend(
444439
accessor: HighLevelHostAccessor,
@@ -449,41 +444,67 @@ class CoderRemoteConnectionHandle {
449444
lifetime: LifetimeDefinition,
450445
currentStatus: UnattendedHostStatus?,
451446
): UnattendedHostStatus? {
452-
val details = "${workspace.hostname}:${ideDir.toRawString()}, project=${remoteProjectPath.toRawString()}"
453-
return try {
454-
if (currentStatus?.appPid != null &&
455-
!currentStatus.joinLink.isNullOrBlank() &&
456-
accessor.isPidAlive(currentStatus.appPid.toInt())
457-
) {
458-
// If the PID is alive, assume the join link we have is still
459-
// valid. The join link seems to change even if it is the same
460-
// backend running, so if we always fetched the link the client
461-
// would relaunch over and over.
447+
val details = "$${workspace.hostname}:${ideDir.toRawString()}, project=${remoteProjectPath.toRawString()}"
448+
449+
// Check if the current IDE is alive.
450+
if (currentStatus != null) {
451+
val isAlive = try {
452+
val isAlive = accessor.isPidAlive(currentStatus.appPid.toInt())
453+
logger.info("${workspace.ideName} status: pid=${currentStatus.appPid}, alive=$isAlive")
454+
isAlive
455+
} catch (ex: Exception) {
456+
logger.info("Failed to check if ${workspace.ideName} is alive on $details: pid=${currentStatus.appPid}", ex)
457+
false
458+
}
459+
if (isAlive) {
460+
// Use the current status and join link.
462461
return currentStatus
462+
} else {
463+
logger.info("Relaunching ${workspace.ideName} since it is not alive...")
463464
}
465+
} else {
466+
logger.info("Launching ${workspace.ideName} for the first time on ${workspace.hostname}...")
467+
}
464468

465-
// See if there is already a backend running. Weirdly, there is
466-
// always a PID, even if there is no backend running, and
467-
// backendUnresponsive is always false, but the links are null so
468-
// hopefully that is an accurate indicator that the IDE is up.
469-
val status = accessor.getHostIdeStatus(ideDir, remoteProjectPath)
470-
if (!status.joinLink.isNullOrBlank()) {
471-
logger.info("Found existing ${workspace.ideName} backend on $details")
472-
return status
473-
}
469+
// If the PID is not alive, spawn a new backend. This may not be
470+
// idempotent, so only call if we are really sure we need to.
471+
accessor.startHostIdeInBackgroundAndDetach(lifetime, ideDir, remoteProjectPath, logsDir)
474472

475-
// Otherwise, spawn a new backend. This does not seem to spawn a
476-
// second backend if one is already running, yet it does somehow
477-
// cause a second client to launch. So only run this if we are
478-
// really sure we have to launch a new backend.
479-
logger.info("Starting ${workspace.ideName} backend on $details")
480-
accessor.startHostIdeInBackgroundAndDetach(lifetime, ideDir, remoteProjectPath, logsDir)
481-
// Get the newly spawned PID and join link.
482-
return accessor.getHostIdeStatus(ideDir, remoteProjectPath)
483-
} catch (ex: Exception) {
484-
logger.info("Failed to get ${workspace.ideName} status from $details", ex)
485-
currentStatus
473+
// Get the newly spawned PID and join link.
474+
var attempts = 0
475+
val maxAttempts = 6
476+
val wait = TimeUnit.SECONDS.toMillis(5)
477+
while (lifetime.status == LifetimeStatus.Alive) {
478+
try {
479+
attempts++
480+
val status = accessor.getHostIdeStatus(ideDir, remoteProjectPath)
481+
if (!status.joinLink.isNullOrBlank()) {
482+
logger.info("Found join link for ${workspace.ideName}; proceeding to connect: pid=${status.appPid}")
483+
return status
484+
}
485+
// If we did not get a join link, see if the IDE is alive in
486+
// case it died and we need to respawn.
487+
val isAlive = status.appPid > 0 && accessor.isPidAlive(status.appPid.toInt())
488+
logger.info("${workspace.ideName} status: pid=${status.appPid}, alive=$isAlive, unresponsive=${status.backendUnresponsive}, attempt=$attempts")
489+
// It is not clear whether the PID can be trusted because we get
490+
// one even when there is no backend at all. For now give it
491+
// some time and if it is still dead, only then try to respawn.
492+
if (!isAlive && attempts >= maxAttempts) {
493+
logger.info("${workspace.ideName} is still not alive after $attempts checks, respawning backend and waiting $wait ms to try again")
494+
accessor.startHostIdeInBackgroundAndDetach(lifetime, ideDir, remoteProjectPath, logsDir)
495+
attempts = 0
496+
} else {
497+
logger.info("No join link found in status; waiting $wait ms to try again")
498+
}
499+
} catch (ex: Exception) {
500+
logger.info("Failed to get ${workspace.ideName} status from $details; waiting $wait ms to try again", ex)
501+
}
502+
delay(wait)
486503
}
504+
505+
// This means the lifetime is no longer alive.
506+
logger.info("Connection to ${workspace.ideName} on $details aborted by user")
507+
return null
487508
}
488509

489510
companion object {

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy