upload/koji: add a retrying mechanism for CGImport

CGImport quite often fails with the following error:
Fault(1000): File size 735051776 for Fedora-IoT-38.raw.xz (expected 738785372)
doesn't match. Corrupted upload?

When I inspect the file manually, everything seems fine, though.
I believe that this is because of NFS inconsistency when multiple DNS-balanced
kojihubs are used in the setup (which is what Fedora uses). The added
loop implements a retrying mechanism for the CGImport call to try again
whenever we see this issue.

Note that this isn't caught by the other HTTP retrying mechanism because a failed
XMLRPC call still returns HTTP status code 200.

Signed-off-by: Ondřej Budai <ondrej@budai.cz>
This commit is contained in:
Ondřej Budai 2023-04-19 13:58:47 +02:00 committed by Tomáš Hozza
parent ce5e41f980
commit fdc4f54be8

View file

@ -3,10 +3,6 @@ package koji
import ( import (
"bytes" "bytes"
"context" "context"
"net"
"strings"
"time"
// koji uses MD5 hashes // koji uses MD5 hashes
/* #nosec G501 */ /* #nosec G501 */
"crypto/md5" "crypto/md5"
@ -17,15 +13,18 @@ import (
"fmt" "fmt"
"hash/adler32" "hash/adler32"
"io" "io"
"net"
"net/http" "net/http"
"net/url" "net/url"
"os" "os"
"strings"
"time"
rh "github.com/hashicorp/go-retryablehttp"
"github.com/kolo/xmlrpc" "github.com/kolo/xmlrpc"
"github.com/sirupsen/logrus" "github.com/sirupsen/logrus"
"github.com/ubccr/kerby/khttp" "github.com/ubccr/kerby/khttp"
rh "github.com/hashicorp/go-retryablehttp"
"github.com/osbuild/osbuild-composer/internal/rpmmd" "github.com/osbuild/osbuild-composer/internal/rpmmd"
) )
@ -284,13 +283,32 @@ func (k *Koji) CGImport(build ImageBuild, buildRoots []BuildRoot, images []Image
return nil, err return nil, err
} }
var result CGImportResult const retryCount = 10
err = k.xmlrpc.Call("CGImport", []interface{}{string(metadata), directory, token}, &result) const retryDelay = time.Second
if err != nil {
return nil, err for attempt := 0; attempt < retryCount; attempt += 1 {
var result CGImportResult
err = k.xmlrpc.Call("CGImport", []interface{}{string(metadata), directory, token}, &result)
if err != nil {
// Retry when the error mentions a corrupted upload. It's usually
// just because of NFS inconsistency when the kojihub has multiple
// replicas.
if strings.Contains(err.Error(), "Corrupted upload") {
time.Sleep(retryDelay)
continue
}
// Fail immediately on other errors, they are probably legitimate
return nil, err
}
logrus.Infof("CGImport succeeded after %d attempts", attempt+1)
return &result, nil
} }
return &result, nil return nil, fmt.Errorf("failed to import a build after %d attempts: %w", retryCount, err)
} }
// uploadChunk uploads a byte slice to a given filepath/filname at a given offset // uploadChunk uploads a byte slice to a given filepath/filname at a given offset