Skip to content

Commit

Permalink
Merge pull request #13 from cedana/fix/retryable-setups
Browse files Browse the repository at this point in the history
CED-63: basic retry command added
  • Loading branch information
nravic authored Aug 7, 2023
2 parents 04c0553 + 7a4bbeb commit 161b801
Show file tree
Hide file tree
Showing 6 changed files with 108 additions and 51 deletions.
3 changes: 3 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"restructuredtext.preview.docutils.disabled": true
}
90 changes: 86 additions & 4 deletions cmd/run.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,50 @@ var runCmd = &cobra.Command{
},
}

// retryCmd retries the setup of an existing job. It takes the job ID as its
// single argument, loads the job from the db, picks the instance tagged
// "worker" among the instances attached to the job and hands it to retryJob.
var retryCmd = &cobra.Command{
	Use:   "retry",
	Short: "Retry a failed setup from jobID [job-id]",
	Args:  cobra.ExactArgs(1),
	RunE: func(cmd *cobra.Command, args []string) error {
		r := buildRunner()
		defer r.cleanRunner()

		jobID := args[0]

		// check db for existing job
		job := r.db.GetJob(jobID)
		if job == nil {
			return fmt.Errorf("could not find job with id %s", jobID)
		}
		r.job = job

		attachedInstanceIDs, err := job.GetInstanceIds()
		if err != nil {
			return err
		}

		// TODO: this lookup should be way better; for now every attached
		// instance is fetched and the one tagged "worker" is used. If several
		// are tagged "worker", the last one wins.
		var worker cedana.Instance
		found := false
		for _, i := range attachedInstanceIDs {
			instance := r.db.GetInstanceByCedanaID(i.InstanceID)
			if instance.Tag == "worker" {
				worker = instance
				found = true
			}
		}

		// Without this guard a zero-value instance would be handed to
		// retryJob when the job has no worker attached.
		if !found {
			return fmt.Errorf("could not find a worker instance attached to job %s", jobID)
		}

		return r.retryJob(worker)
	},
}

var showInstancesCmd = &cobra.Command{
Use: "show",
Short: "Show instances launched with Cedana",
Expand Down Expand Up @@ -164,21 +208,21 @@ var destroyCmd = &cobra.Command{
// TODO NR: assuming for now user just wants to destroy one instance, have to expand
id := args[0]

instance := r.db.GetInstanceByProviderId(id)
if instance == nil {
instance := r.db.GetInstanceByCedanaID(id)
if instance.ID == 0 {
return fmt.Errorf("could not find instance with id %s", id)
}

switch instance.Provider {
case "aws":
aws := r.providers["aws"]
err := aws.DestroyInstance(*instance)
err := aws.DestroyInstance(instance)
if err != nil {
return err
}
case "paperspace":
paperspace := r.providers["paperspace"]
err := paperspace.DestroyInstance(*instance)
err := paperspace.DestroyInstance(instance)
if err != nil {
return err
}
Expand Down Expand Up @@ -239,6 +283,43 @@ var restoreCmd = &cobra.Command{
},
}

// retryJob is a very basic retry of a failed setup. It allows users to play
// with the yaml without needing to tear down and redeploy.
//
// It re-runs ClientSetup against worker for r.job. On failure the job is marked
// JobStateSetupFailed and the setup error is returned. On success the job is
// marked JobStateRunning, a local "orchestrator" instance is created and
// attached to the job, and the CLI daemon is started for it.
func (r *Runner) retryJob(worker cedana.Instance) error {
	is := BuildInstanceSetup(worker, *r.job)

	if err := is.ClientSetup(true); err != nil {
		r.logger.Info().Msgf("could not set up client, retry using `./cedana-cli setup -i %s -j %s`", worker.CedanaID, "yourjob.yml")
		r.db.UpdateJobState(r.job, types.JobStateSetupFailed)
		return err
	}

	r.db.UpdateJobState(r.job, types.JobStateRunning)

	// job is orchestrated by a local worker
	orch, err := r.db.CreateInstance(&cedana.Instance{
		Provider:    "local",
		IPAddress:   "0.0.0.0",
		Tag:         "orchestrator",
		State:       "running",
		AllocatedID: "local",
	})
	// Check the error before touching orch: it is dereferenced below and
	// must not be used if instance creation failed.
	if err != nil {
		return err
	}

	r.db.AttachInstanceToJob(r.job, *orch)

	cd := NewCLIDaemon()
	cd.Start(orch.CedanaID, r.job.JobID, worker.CedanaID)

	return nil
}

// restoreJob manually restores the most recent checkpoint onto a new instance
func (r *Runner) restoreJob(jobID string) error {
// validate that job exists
Expand Down Expand Up @@ -647,4 +728,5 @@ func init() {
rootCmd.AddCommand(showInstancesCmd)
rootCmd.AddCommand(destroyAllCmd)
rootCmd.AddCommand(restoreCmd)
rootCmd.AddCommand(retryCmd)
}
40 changes: 7 additions & 33 deletions cmd/setup.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,15 +26,13 @@ type InstanceSetup struct {
job *cedana.Job
}

var userOnly bool
var jobFile string
var instanceId string

var SetupCmd = &cobra.Command{
Use: "setup",
Short: "Manually set up a launched instance with Cedana defaults and user-provided scripts",
Long: "Provide commands to run on the remote instance in user_commands.yaml in the ~/.cedana config folder",
Hidden: true,
Use: "setup",
Short: "Manually set up a launched instance with Cedana defaults and user-provided scripts",
Long: "Provide commands to run on the remote instance in user_commands.yaml in the ~/.cedana config folder",
RunE: func(cmd *cobra.Command, args []string) error {
// ClientSetup takes a SpotInstance as input - match against the state file
db := db.NewDB()
Expand All @@ -44,27 +42,23 @@ var SetupCmd = &cobra.Command{
l.Fatal().Err(err).Msg("could not set up cedana job")
}

instance := db.GetInstanceByProviderId(instanceId)
instance := db.GetInstanceByCedanaID(instanceId)
if instance.IPAddress == "" {
return fmt.Errorf("could not find instance with id %s", instanceId)
}
cfg, err := utils.InitCedanaConfig()
if err != nil {
return fmt.Errorf("could not load spot config %v", err)
return fmt.Errorf("could not load config %v", err)
}

is := InstanceSetup{
logger: &l,
cfg: cfg,
instance: *instance,
instance: instance,
jobFile: jobFile,
}

if userOnly {
is.execUserCommands()
} else {
is.ClientSetup(true)
}
is.ClientSetup(true)
return nil
},
}
Expand Down Expand Up @@ -150,25 +144,6 @@ func (is *InstanceSetup) CreateConn() (*ssh.Client, error) {
return conn, nil
}

// execUserCommands runs only the user-provided setup commands: it opens a
// connection with CreateConn, collects the commands via buildUserSetupCommands
// and executes them over that connection. A connection error is returned to
// the caller; an execution error is reported through the logger's Fatal event.
func (is *InstanceSetup) execUserCommands() error {
	client, err := is.CreateConn()
	if err != nil {
		return err
	}
	defer client.Close()

	var userCmds []string
	is.buildUserSetupCommands(&userCmds)

	if execErr := is.execCommands(userCmds, client); execErr != nil {
		is.logger.Fatal().Err(execErr).Msg("error executing commands")
	}

	return nil
}

// Runs cedana-specific and user-specified instantiation scripts for a client instance in an SSH session.
func (is *InstanceSetup) ClientSetup(runTask bool) error {

Expand Down Expand Up @@ -536,7 +511,6 @@ func (is *InstanceSetup) scpWorkDir(workDirPath string) error {

func init() {
rootCmd.AddCommand(SetupCmd)
SetupCmd.Flags().BoolVar(&userOnly, "user", false, "run only user-specificed commands on remote instance")
SetupCmd.Flags().StringVarP(&jobFile, "job", "j", "", "job file to use for setup")
SetupCmd.Flags().StringVarP(&instanceId, "instance", "i", "", "provider instance id to setup")
cobra.MarkFlagRequired(SetupCmd.Flags(), "job")
Expand Down
1 change: 1 addition & 0 deletions examples/.gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
llama.cpp
llama_models
workdir
20 changes: 11 additions & 9 deletions examples/trill.yml
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
instance_specs:
memory_gb: 30
cpu_cores: 4
max_price_usd_hour: 1
work_dir: '/home/USER/work_dir'
memory_gb: 30
cpu_cores: 4
max_price_usd_hour: 1
work_dir: 'work_dir'

setup:
run:
- 'sudo apt-get update && sudo apt-get install -y python3 python3-venv'
- 'docker build -t trill .'
run:
- 'sudo apt-get update && sudo apt-get install -y python3 python3-venv'
- 'cd work_dir/TRILL && sudo docker build -t trill .'


task:
run:
- 'echo "hello world" ; docker run trill example_1 0 -h'
run:
- 'echo "hello world" ; sudo docker run trill example_1 0 -h'
5 changes: 0 additions & 5 deletions examples/workdir/simple_loop.sh

This file was deleted.

0 comments on commit 161b801

Please sign in to comment.