upload/koji: add a retrying mechanism for CGImport

CGImport quite often fails with the following error:
Fault(1000): File size 735051776 for Fedora-IoT-38.raw.xz (expected 738785372)
doesn't match. Corrupted upload?

When I inspect the file manually, everything seems fine, though.
I believe that this is because of NFS inconsistency when multiple DNS-balanced
kojihubs are used in the setup (which is what Fedora uses). The added
loop implements a retrying mechanism for the CGImport call to try again
whenever we see this issue.

Note that this isn't caught by the other HTTP retrying mechanism because a failed
XMLRPC call returns HTTP status code 200.

Signed-off-by: Ondřej Budai <ondrej@budai.cz>
This commit is contained in:
Ondřej Budai 2023-04-19 13:58:47 +02:00 committed by Tomáš Hozza
parent ce5e41f980
commit fdc4f54be8

View file

@ -3,10 +3,6 @@ package koji
import (
"bytes"
"context"
"net"
"strings"
"time"
// koji uses MD5 hashes
/* #nosec G501 */
"crypto/md5"
@ -17,15 +13,18 @@ import (
"fmt"
"hash/adler32"
"io"
"net"
"net/http"
"net/url"
"os"
"strings"
"time"
rh "github.com/hashicorp/go-retryablehttp"
"github.com/kolo/xmlrpc"
"github.com/sirupsen/logrus"
"github.com/ubccr/kerby/khttp"
rh "github.com/hashicorp/go-retryablehttp"
"github.com/osbuild/osbuild-composer/internal/rpmmd"
)
@ -284,13 +283,32 @@ func (k *Koji) CGImport(build ImageBuild, buildRoots []BuildRoot, images []Image
return nil, err
}
var result CGImportResult
err = k.xmlrpc.Call("CGImport", []interface{}{string(metadata), directory, token}, &result)
if err != nil {
return nil, err
const retryCount = 10
const retryDelay = time.Second
for attempt := 0; attempt < retryCount; attempt += 1 {
var result CGImportResult
err = k.xmlrpc.Call("CGImport", []interface{}{string(metadata), directory, token}, &result)
if err != nil {
// Retry when the error mentions a corrupted upload. It's usually
// just because of NFS inconsistency when the kojihub has multiple
// replicas.
if strings.Contains(err.Error(), "Corrupted upload") {
time.Sleep(retryDelay)
continue
}
// Fail immediately on other errors, they are probably legitimate
return nil, err
}
logrus.Infof("CGImport succeeded after %d attempts", attempt+1)
return &result, nil
}
return &result, nil
return nil, fmt.Errorf("failed to import a build after %d attempts: %w", retryCount, err)
}
// uploadChunk uploads a byte slice to a given filepath/filename at a given offset