package gobridge import ( "context" "fmt" "net" "os" "os/exec" "sync" "sync/atomic" "time" ) type worker struct { cfg *poolConfig id int sockPath string conns chan net.Conn stopped atomic.Bool stopCh chan struct{} stopOnce sync.Once mu sync.Mutex // 保护 cmd cmd *exec.Cmd } func newWorker(cfg *poolConfig, id int) (*worker, error) { w := &worker{ cfg: cfg, id: id, conns: make(chan net.Conn, cfg.maxConnsPerWorker), stopCh: make(chan struct{}), } if err := w.start(); err != nil { return nil, err } go w.monitor() return w, nil } // start 启动 Python 子进程并预建连接池,返回当前 cmd 供 monitor() 等待 func (w *worker) start() error { sockPath := fmt.Sprintf("%s/gobridge-%d-%d.sock", w.cfg.socketDir, os.Getpid(), w.id) os.Remove(sockPath) w.sockPath = sockPath cmd := exec.Command(w.cfg.pythonExe, w.cfg.scriptArgs...) cmd.Dir = w.cfg.workDir cmd.Env = append(os.Environ(), w.cfg.env...) cmd.Env = append(cmd.Env, "GOBRIDGE_SOCKET_PATH="+sockPath) if w.cfg.stdout != nil { cmd.Stdout = w.cfg.stdout } else { cmd.Stdout = os.Stdout } if w.cfg.stderr != nil { cmd.Stderr = w.cfg.stderr } else { cmd.Stderr = os.Stderr } if err := cmd.Start(); err != nil { return fmt.Errorf("start python worker: %w", err) } w.mu.Lock() w.cmd = cmd w.mu.Unlock() // 等待 socket 文件出现(最多 10 秒) deadline := time.Now().Add(10 * time.Second) for time.Now().Before(deadline) { if _, err := os.Stat(sockPath); err == nil { break } select { case <-w.stopCh: cmd.Process.Kill() cmd.Wait() return fmt.Errorf("stopped while waiting for socket") case <-time.After(50 * time.Millisecond): } } if _, err := os.Stat(sockPath); err != nil { cmd.Process.Kill() cmd.Wait() return fmt.Errorf("worker socket did not appear: %s", sockPath) } // 预建连接 for i := 0; i < w.cfg.maxConnsPerWorker; i++ { conn, err := net.DialTimeout("unix", sockPath, 5*time.Second) if err != nil { cmd.Process.Kill() cmd.Wait() for len(w.conns) > 0 { (<-w.conns).Close() } return fmt.Errorf("connect to worker: %w", err) } w.conns <- conn } return nil } // monitor 监控 Python 进程,崩溃时自动重启(指数退避,最长 30s) func (w *worker) monitor() { for { // 等待当前进程退出 w.mu.Lock() cmd := w.cmd w.mu.Unlock() cmd.Wait() if w.stopped.Load() { return } os.Remove(w.sockPath) // 排空失效连接 for len(w.conns) > 0 { (<-w.conns).Close() } // 指数退避重启 for attempt := 0; !w.stopped.Load(); attempt++ { if err := w.start(); err == nil { break } delay := time.Duration(min(1< 0 { (<-w.conns).Close() } os.Remove(w.sockPath) }