Fix out of memory in hash computation

2021-06-07 12:07:56 +01:00 · 2021-06-07 12:07:56 +01:00 · 02e8dcfe9c
commit 02e8dcfe9c
parent 242fd828aa
10 changed files with 151 additions and 91 deletions
--- a/lib/fingerprints.js
+++ b/lib/fingerprints.js
@ -36,9 +36,9 @@ function computeFirstMod() {
 * the hashes of the lines near the end of the file.
 *
 * @param callback function that is called with the line number (1-based) and hash for every line
- * @param input The file's contents
+ * @param filepath The path to the file to hash
 */
-function hash(callback, input) {
+async function hash(callback, filepath) {
    // A rolling view into the input
    const window = Array(BLOCK_SIZE).fill(0);
    // If the character in the window is the start of a new line
@ -82,12 +82,11 @@ function hash(callback, input) {
    // as we go. Once we reach a point in the window again then we've processed
    // BLOCK_SIZE characters and if the last character at this point in the window
    // was the start of a line then we should output the hash for that line.
-    for (let i = 0, len = input.length; i <= len; i++) {
-        let current = i === len ? 65535 : input.charCodeAt(i);
+    const processCharacter = function (current) {
        // skip tabs, spaces, and line feeds that come directly after a carriage return
        if (current === space || current === tab || (prevCR && current === lf)) {
            prevCR = false;
-            continue;
+            return;
        }
        // replace CR with LF
        if (current === cr) {
@ -109,7 +108,18 @@ function hash(callback, input) {
            lineStart = true;
        }
        updateHash(current);
-    }
+    };
+    await new Promise((fulfill) => {
+        const readStream = fs.createReadStream(filepath, "utf8");
+        readStream.on("close", fulfill);
+        readStream.on("end", () => {
+            processCharacter(65535);
+        });
+        readStream.on("data", (data) => {
+            for (let i = 0; i < data.length; ++i)
+                processCharacter(data.charCodeAt(i));
+        });
+    });
    // Flush the remaining lines
    for (let i = 0; i < BLOCK_SIZE; i++) {
        if (lineNumbers[index] !== -1) {
@ -206,8 +216,8 @@ function resolveUriToFile(location, artifacts, checkoutPath, logger) {
 exports.resolveUriToFile = resolveUriToFile;
 // Compute fingerprints for results in the given sarif file
 // and return an updated sarif file contents.
-function addFingerprints(sarifContents, checkoutPath, logger) {
-    var _a, _b;
+async function addFingerprints(sarifContents, checkoutPath, logger) {
+    var _a, _b, _c, _d, _e;
    const sarif = JSON.parse(sarifContents);
    // Gather together results for the same file and construct
    // callbacks to accept hashes for that file and update the location
@ -222,6 +232,11 @@ function addFingerprints(sarifContents, checkoutPath, logger) {
                logger.debug(`Unable to compute fingerprint for invalid location: ${JSON.stringify(primaryLocation)}`);
                continue;
            }
+            if (typeof ((_e = (_d = (_c = primaryLocation) === null || _c === void 0 ? void 0 : _c.physicalLocation) === null || _d === void 0 ? void 0 : _d.region) === null || _e === void 0 ? void 0 : _e.startLine) ===
+                "undefined") {
+                // Locations without a line number are unlikely to be source files
+                continue;
+            }
            const filepath = resolveUriToFile(primaryLocation.physicalLocation.artifactLocation, artifacts, checkoutPath, logger);
            if (!filepath) {
                continue;
@ -240,8 +255,7 @@ function addFingerprints(sarifContents, checkoutPath, logger) {
                c(lineNumber, hashValue);
            }
        };
-        const fileContents = fs.readFileSync(filepath).toString();
-        hash(teeCallback, fileContents);
+        await hash(teeCallback, filepath);
    }
    return JSON.stringify(sarif);
 }
--- a/lib/fingerprints.js.map
+++ b/lib/fingerprints.js.map
--- a/lib/fingerprints.test.js
+++ b/lib/fingerprints.test.js
@ -16,28 +16,33 @@ const ava_1 = __importDefault(require("ava"));
 const fingerprints = __importStar(require("./fingerprints"));
 const logging_1 = require("./logging");
 const testing_utils_1 = require("./testing-utils");
+const util = __importStar(require("./util"));
 testing_utils_1.setupTests(ava_1.default);
-function testHash(t, input, expectedHashes) {
-    let index = 0;
-    const callback = function (lineNumber, hash) {
-        t.is(lineNumber, index + 1);
-        t.is(hash, expectedHashes[index]);
-        index++;
-    };
-    fingerprints.hash(callback, input);
-    t.is(index, input.split(/\r\n|\r|\n/).length);
+async function testHash(t, input, expectedHashes) {
+    await util.withTmpDir(async (tmpDir) => {
+        const tmpFile = path.resolve(tmpDir, "testfile");
+        fs.writeFileSync(tmpFile, input);
+        let index = 0;
+        const callback = function (lineNumber, hash) {
+            t.is(lineNumber, index + 1);
+            t.is(hash, expectedHashes[index]);
+            index++;
+        };
+        await fingerprints.hash(callback, tmpFile);
+        t.is(index, input.split(/\r\n|\r|\n/).length);
+    });
 }
-ava_1.default("hash", (t) => {
+ava_1.default("hash", async (t) => {
    // Try empty file
-    testHash(t, "", ["c129715d7a2bc9a3:1"]);
+    await testHash(t, "", ["c129715d7a2bc9a3:1"]);
    // Try various combinations of newline characters
-    testHash(t, " a\nb\n  \t\tc\n d", [
+    await testHash(t, " a\nb\n  \t\tc\n d", [
        "271789c17abda88f:1",
        "54703d4cd895b18:1",
        "180aee12dab6264:1",
        "a23a3dc5e078b07b:1",
    ]);
-    testHash(t, " hello; \t\nworld!!!\n\n\n  \t\tGreetings\n End", [
+    await testHash(t, " hello; \t\nworld!!!\n\n\n  \t\tGreetings\n End", [
        "8b7cf3e952e7aeb2:1",
        "b1ae1287ec4718d9:1",
        "bff680108adb0fcc:1",
@ -45,7 +50,7 @@ ava_1.default("hash", (t) => {
        "b86d3392aea1be30:1",
        "e6ceba753e1a442:1",
    ]);
-    testHash(t, " hello; \t\nworld!!!\n\n\n  \t\tGreetings\n End\n", [
+    await testHash(t, " hello; \t\nworld!!!\n\n\n  \t\tGreetings\n End\n", [
        "e9496ae3ebfced30:1",
        "fb7c023a8b9ccb3f:1",
        "ce8ba1a563dcdaca:1",
@ -54,7 +59,7 @@ ava_1.default("hash", (t) => {
        "c8e28b0b4002a3a0:1",
        "c129715d7a2bc9a3:1",
    ]);
-    testHash(t, " hello; \t\nworld!!!\r\r\r  \t\tGreetings\r End\r", [
+    await testHash(t, " hello; \t\nworld!!!\r\r\r  \t\tGreetings\r End\r", [
        "e9496ae3ebfced30:1",
        "fb7c023a8b9ccb3f:1",
        "ce8ba1a563dcdaca:1",
@ -63,7 +68,7 @@ ava_1.default("hash", (t) => {
        "c8e28b0b4002a3a0:1",
        "c129715d7a2bc9a3:1",
    ]);
-    testHash(t, " hello; \t\r\nworld!!!\r\n\r\n\r\n  \t\tGreetings\r\n End\r\n", [
+    await testHash(t, " hello; \t\r\nworld!!!\r\n\r\n\r\n  \t\tGreetings\r\n End\r\n", [
        "e9496ae3ebfced30:1",
        "fb7c023a8b9ccb3f:1",
        "ce8ba1a563dcdaca:1",
@ -72,7 +77,7 @@ ava_1.default("hash", (t) => {
        "c8e28b0b4002a3a0:1",
        "c129715d7a2bc9a3:1",
    ]);
-    testHash(t, " hello; \t\nworld!!!\r\n\n\r  \t\tGreetings\r End\r\n", [
+    await testHash(t, " hello; \t\nworld!!!\r\n\n\r  \t\tGreetings\r End\r\n", [
        "e9496ae3ebfced30:1",
        "fb7c023a8b9ccb3f:1",
        "ce8ba1a563dcdaca:1",
@ -82,7 +87,7 @@ ava_1.default("hash", (t) => {
        "c129715d7a2bc9a3:1",
    ]);
    // Try repeating line that will generate identical hashes
-    testHash(t, "Lorem ipsum dolor sit amet.\n".repeat(10), [
+    await testHash(t, "Lorem ipsum dolor sit amet.\n".repeat(10), [
        "a7f2ff13bc495cf2:1",
        "a7f2ff13bc495cf2:2",
        "a7f2ff13bc495cf2:3",
@ -95,7 +100,7 @@ ava_1.default("hash", (t) => {
        "cc97dc7b1d7d8f7b:1",
        "c129715d7a2bc9a3:1",
    ]);
-    testHash(t, "x = 2\nx = 1\nprint(x)\nx = 3\nprint(x)\nx = 4\nprint(x)\n", [
+    await testHash(t, "x = 2\nx = 1\nprint(x)\nx = 3\nprint(x)\nx = 4\nprint(x)\n", [
        "e54938cc54b302f1:1",
        "bb609acbe9138d60:1",
        "1131fd5871777f34:1",
@ -150,7 +155,7 @@ ava_1.default("resolveUriToFile", (t) => {
    t.is(testResolveUriToFile(dirpath, undefined, []), undefined);
    t.is(testResolveUriToFile(`file://${dirpath}`, undefined, []), undefined);
 });
-ava_1.default("addFingerprints", (t) => {
+ava_1.default("addFingerprints", async (t) => {
    // Run an end-to-end test on a test file
    let input = fs
        .readFileSync(`${__dirname}/../src/testdata/fingerprinting.input.sarif`)
@ -163,9 +168,9 @@ ava_1.default("addFingerprints", (t) => {
    expected = JSON.stringify(JSON.parse(expected));
    // The URIs in the SARIF files resolve to files in the testdata directory
    const checkoutPath = path.normalize(`${__dirname}/../src/testdata`);
-    t.deepEqual(fingerprints.addFingerprints(input, checkoutPath, logging_1.getRunnerLogger(true)), expected);
+    t.deepEqual(await fingerprints.addFingerprints(input, checkoutPath, logging_1.getRunnerLogger(true)), expected);
 });
-ava_1.default("missingRegions", (t) => {
+ava_1.default("missingRegions", async (t) => {
    // Run an end-to-end test on a test file
    let input = fs
        .readFileSync(`${__dirname}/../src/testdata/fingerprinting2.input.sarif`)
@ -178,6 +183,6 @@ ava_1.default("missingRegions", (t) => {
    expected = JSON.stringify(JSON.parse(expected));
    // The URIs in the SARIF files resolve to files in the testdata directory
    const checkoutPath = path.normalize(`${__dirname}/../src/testdata`);
-    t.deepEqual(fingerprints.addFingerprints(input, checkoutPath, logging_1.getRunnerLogger(true)), expected);
+    t.deepEqual(await fingerprints.addFingerprints(input, checkoutPath, logging_1.getRunnerLogger(true)), expected);
 });
 //# sourceMappingURL=fingerprints.test.js.map
--- a/lib/fingerprints.test.js.map
+++ b/lib/fingerprints.test.js.map
--- a/lib/upload-lib.js
+++ b/lib/upload-lib.js
@ -243,7 +243,7 @@ async function uploadFiles(sarifFiles, repositoryNwo, commitOid, ref, analysisKe
        validateSarifFileSchema(file, logger);
    }
    let sarifPayload = combineSarifFiles(sarifFiles);
-    sarifPayload = fingerprints.addFingerprints(sarifPayload, checkoutPath, logger);
+    sarifPayload = await fingerprints.addFingerprints(sarifPayload, checkoutPath, logger);
    sarifPayload = populateRunAutomationDetails(sarifPayload, category, analysisKey, environment);
    const zippedSarif = zlib_1.default.gzipSync(sarifPayload).toString("base64");
    const checkoutURI = file_url_1.default(checkoutPath);
--- a/lib/upload-lib.js.map
+++ b/lib/upload-lib.js.map
--- a/src/fingerprints.test.ts
+++ b/src/fingerprints.test.ts
@ -7,32 +7,41 @@ import test from "ava";
 import * as fingerprints from "./fingerprints";
 import { getRunnerLogger } from "./logging";
 import { setupTests } from "./testing-utils";
+import * as util from "./util";

 setupTests(test);

-function testHash(t: ava.Assertions, input: string, expectedHashes: string[]) {
-  let index = 0;
-  const callback = function (lineNumber: number, hash: string) {
-    t.is(lineNumber, index + 1);
-    t.is(hash, expectedHashes[index]);
-    index++;
-  };
-  fingerprints.hash(callback, input);
-  t.is(index, input.split(/\r\n|\r|\n/).length);
+async function testHash(
+  t: ava.Assertions,
+  input: string,
+  expectedHashes: string[]
+) {
+  await util.withTmpDir(async (tmpDir) => {
+    const tmpFile = path.resolve(tmpDir, "testfile");
+    fs.writeFileSync(tmpFile, input);
+    let index = 0;
+    const callback = function (lineNumber: number, hash: string) {
+      t.is(lineNumber, index + 1);
+      t.is(hash, expectedHashes[index]);
+      index++;
+    };
+    await fingerprints.hash(callback, tmpFile);
+    t.is(index, input.split(/\r\n|\r|\n/).length);
+  });
 }

-test("hash", (t: ava.Assertions) => {
+test("hash", async (t: ava.Assertions) => {
  // Try empty file
-  testHash(t, "", ["c129715d7a2bc9a3:1"]);
+  await testHash(t, "", ["c129715d7a2bc9a3:1"]);

  // Try various combinations of newline characters
-  testHash(t, " a\nb\n  \t\tc\n d", [
+  await testHash(t, " a\nb\n  \t\tc\n d", [
    "271789c17abda88f:1",
    "54703d4cd895b18:1",
    "180aee12dab6264:1",
    "a23a3dc5e078b07b:1",
  ]);
-  testHash(t, " hello; \t\nworld!!!\n\n\n  \t\tGreetings\n End", [
+  await testHash(t, " hello; \t\nworld!!!\n\n\n  \t\tGreetings\n End", [
    "8b7cf3e952e7aeb2:1",
    "b1ae1287ec4718d9:1",
    "bff680108adb0fcc:1",
@ -40,7 +49,7 @@ test("hash", (t: ava.Assertions) => {
    "b86d3392aea1be30:1",
    "e6ceba753e1a442:1",
  ]);
-  testHash(t, " hello; \t\nworld!!!\n\n\n  \t\tGreetings\n End\n", [
+  await testHash(t, " hello; \t\nworld!!!\n\n\n  \t\tGreetings\n End\n", [
    "e9496ae3ebfced30:1",
    "fb7c023a8b9ccb3f:1",
    "ce8ba1a563dcdaca:1",
@ -49,7 +58,7 @@ test("hash", (t: ava.Assertions) => {
    "c8e28b0b4002a3a0:1",
    "c129715d7a2bc9a3:1",
  ]);
-  testHash(t, " hello; \t\nworld!!!\r\r\r  \t\tGreetings\r End\r", [
+  await testHash(t, " hello; \t\nworld!!!\r\r\r  \t\tGreetings\r End\r", [
    "e9496ae3ebfced30:1",
    "fb7c023a8b9ccb3f:1",
    "ce8ba1a563dcdaca:1",
@ -58,16 +67,20 @@ test("hash", (t: ava.Assertions) => {
    "c8e28b0b4002a3a0:1",
    "c129715d7a2bc9a3:1",
  ]);
-  testHash(t, " hello; \t\r\nworld!!!\r\n\r\n\r\n  \t\tGreetings\r\n End\r\n", [
-    "e9496ae3ebfced30:1",
-    "fb7c023a8b9ccb3f:1",
-    "ce8ba1a563dcdaca:1",
-    "e20e36e16fcb0cc8:1",
-    "b3edc88f2938467e:1",
-    "c8e28b0b4002a3a0:1",
-    "c129715d7a2bc9a3:1",
-  ]);
-  testHash(t, " hello; \t\nworld!!!\r\n\n\r  \t\tGreetings\r End\r\n", [
+  await testHash(
+    t,
+    " hello; \t\r\nworld!!!\r\n\r\n\r\n  \t\tGreetings\r\n End\r\n",
+    [
+      "e9496ae3ebfced30:1",
+      "fb7c023a8b9ccb3f:1",
+      "ce8ba1a563dcdaca:1",
+      "e20e36e16fcb0cc8:1",
+      "b3edc88f2938467e:1",
+      "c8e28b0b4002a3a0:1",
+      "c129715d7a2bc9a3:1",
+    ]
+  );
+  await testHash(t, " hello; \t\nworld!!!\r\n\n\r  \t\tGreetings\r End\r\n", [
    "e9496ae3ebfced30:1",
    "fb7c023a8b9ccb3f:1",
    "ce8ba1a563dcdaca:1",
@ -78,7 +91,7 @@ test("hash", (t: ava.Assertions) => {
  ]);

  // Try repeating line that will generate identical hashes
-  testHash(t, "Lorem ipsum dolor sit amet.\n".repeat(10), [
+  await testHash(t, "Lorem ipsum dolor sit amet.\n".repeat(10), [
    "a7f2ff13bc495cf2:1",
    "a7f2ff13bc495cf2:2",
    "a7f2ff13bc495cf2:3",
@ -92,16 +105,20 @@ test("hash", (t: ava.Assertions) => {
    "c129715d7a2bc9a3:1",
  ]);

-  testHash(t, "x = 2\nx = 1\nprint(x)\nx = 3\nprint(x)\nx = 4\nprint(x)\n", [
-    "e54938cc54b302f1:1",
-    "bb609acbe9138d60:1",
-    "1131fd5871777f34:1",
-    "5c482a0f8b35ea28:1",
-    "54517377da7028d2:1",
-    "2c644846cb18d53e:1",
-    "f1b89f20de0d133:1",
-    "c129715d7a2bc9a3:1",
-  ]);
+  await testHash(
+    t,
+    "x = 2\nx = 1\nprint(x)\nx = 3\nprint(x)\nx = 4\nprint(x)\n",
+    [
+      "e54938cc54b302f1:1",
+      "bb609acbe9138d60:1",
+      "1131fd5871777f34:1",
+      "5c482a0f8b35ea28:1",
+      "54517377da7028d2:1",
+      "2c644846cb18d53e:1",
+      "f1b89f20de0d133:1",
+      "c129715d7a2bc9a3:1",
+    ]
+  );
 });

 function testResolveUriToFile(uri: any, index: any, artifactsURIs: any[]) {
@ -170,7 +187,7 @@ test("resolveUriToFile", (t) => {
  t.is(testResolveUriToFile(`file://${dirpath}`, undefined, []), undefined);
 });

-test("addFingerprints", (t) => {
+test("addFingerprints", async (t) => {
  // Run an end-to-end test on a test file
  let input = fs
    .readFileSync(`${__dirname}/../src/testdata/fingerprinting.input.sarif`)
@ -187,12 +204,16 @@ test("addFingerprints", (t) => {
  const checkoutPath = path.normalize(`${__dirname}/../src/testdata`);

  t.deepEqual(
-    fingerprints.addFingerprints(input, checkoutPath, getRunnerLogger(true)),
+    await fingerprints.addFingerprints(
+      input,
+      checkoutPath,
+      getRunnerLogger(true)
+    ),
    expected
  );
 });

-test("missingRegions", (t) => {
+test("missingRegions", async (t) => {
  // Run an end-to-end test on a test file
  let input = fs
    .readFileSync(`${__dirname}/../src/testdata/fingerprinting2.input.sarif`)
@ -209,7 +230,11 @@ test("missingRegions", (t) => {
  const checkoutPath = path.normalize(`${__dirname}/../src/testdata`);

  t.deepEqual(
-    fingerprints.addFingerprints(input, checkoutPath, getRunnerLogger(true)),
+    await fingerprints.addFingerprints(
+      input,
+      checkoutPath,
+      getRunnerLogger(true)
+    ),
    expected
  );
 });
--- a/src/fingerprints.ts
+++ b/src/fingerprints.ts
@ -34,9 +34,9 @@ type hashCallback = (lineNumber: number, hash: string) => void;
 * the hashes of the lines near the end of the file.
 *
 * @param callback function that is called with the line number (1-based) and hash for every line
- * @param input The file's contents
+ * @param filepath The path to the file to hash
 */
-export function hash(callback: hashCallback, input: string) {
+export async function hash(callback: hashCallback, filepath: string) {
  // A rolling view into the input
  const window = Array(BLOCK_SIZE).fill(0);

@ -87,12 +87,11 @@ export function hash(callback: hashCallback, input: string) {
  // as we go. Once we reach a point in the window again then we've processed
  // BLOCK_SIZE characters and if the last character at this point in the window
  // was the start of a line then we should output the hash for that line.
-  for (let i = 0, len = input.length; i <= len; i++) {
-    let current = i === len ? 65535 : input.charCodeAt(i);
+  const processCharacter = function (current: number) {
    // skip tabs, spaces, and line feeds that come directly after a carriage return
    if (current === space || current === tab || (prevCR && current === lf)) {
      prevCR = false;
-      continue;
+      return;
    }
    // replace CR with LF
    if (current === cr) {
@ -113,7 +112,19 @@ export function hash(callback: hashCallback, input: string) {
      lineStart = true;
    }
    updateHash(current);
-  }
+  };
+
+  await new Promise((fulfill) => {
+    const readStream = fs.createReadStream(filepath, "utf8");
+    readStream.on("close", fulfill);
+    readStream.on("end", () => {
+      processCharacter(65535);
+    });
+    readStream.on("data", (data) => {
+      for (let i = 0; i < data.length; ++i)
+        processCharacter(data.charCodeAt(i));
+    });
+  });

  // Flush the remaining lines
  for (let i = 0; i < BLOCK_SIZE; i++) {
@ -237,11 +248,11 @@ export function resolveUriToFile(

 // Compute fingerprints for results in the given sarif file
 // and return an updated sarif file contents.
-export function addFingerprints(
+export async function addFingerprints(
  sarifContents: string,
  checkoutPath: string,
  logger: Logger
-): string {
+): Promise<string> {
  const sarif = JSON.parse(sarifContents);

  // Gather together results for the same file and construct
@ -263,6 +274,14 @@ export function addFingerprints(
        continue;
      }

+      if (
+        typeof primaryLocation?.physicalLocation?.region?.startLine ===
+        "undefined"
+      ) {
+        // Locations without a line number are unlikely to be source files
+        continue;
+      }
+
      const filepath = resolveUriToFile(
        primaryLocation.physicalLocation.artifactLocation,
        artifacts,
@ -289,8 +308,7 @@ export function addFingerprints(
        c(lineNumber, hashValue);
      }
    };
-    const fileContents = fs.readFileSync(filepath).toString();
-    hash(teeCallback, fileContents);
+    await hash(teeCallback, filepath);
  }

  return JSON.stringify(sarif);
--- a/src/testdata/fingerprinting2.expected.sarif
+++ b/src/testdata/fingerprinting2.expected.sarif
@ -30,9 +30,7 @@
                    "message": {
                        "text": "This header file should contain a header guard to prevent multiple inclusion."
                    },
-                    "partialFingerprints": {
-                        "primaryLocationLineHash": "599c824c91d0f75e:1"
-                    },
+                    "partialFingerprints": {},
                    "ruleId": "cpp/missing-header-guard",
                    "ruleIndex": 0
                }
--- a/src/upload-lib.ts
+++ b/src/upload-lib.ts
@ -356,7 +356,7 @@ async function uploadFiles(
  }

  let sarifPayload = combineSarifFiles(sarifFiles);
-  sarifPayload = fingerprints.addFingerprints(
+  sarifPayload = await fingerprints.addFingerprints(
    sarifPayload,
    checkoutPath,
    logger