[fix] ignore clone failures

This commit is contained in:
2025-08-25 23:45:34 +01:00
parent 09adbb78f1
commit 01b6902677
3 changed files with 82 additions and 18 deletions

View File

@@ -58,7 +58,51 @@ const FILTER_LIST = [
"https://github.com/spenceralger/rcfinder",
"https://github.com/okta/okta-idx-js",
"https://github.com/cssinjs/jss-compose",
"https://github.com/kaazing/node-http2"
"https://github.com/kaazing/node-http2",
"https://github.com/Financial-Times/polyfill-useragent-normaliser",
"https://github.com/cssinjs/jss-extend",
"https://github.com/cssinjs/jss-template",
"https://github.com/serverless/enterprise-plugin",
"https://github.com/financial-times/polyfill-library",
"https://github.com/trufflesuite/truffle/tree/master/packages/*",
"https://github.com/FlatFilers/platform-sdk-mono",
"https://github.com/rkusa/linebreaker",
"https://github.com/shahata/cdnjs-cdn-data",
"https://github.com/openapi-library/OpenAPIValidators/tree/master/packages/*",
"https://github.com/paypal/Checkout-NodeJS-SDK",
"https://github.com/chilijung/gulp-cssmin",
"https://github.com/shahata/jsdelivr-cdn-data",
"https://github.com/voltidev/broccoli-svg-optimizer",
"https://github.com/jonschlinkert/is-registered",
"https://github.com/helpers/helper-cache",
"https://github.com/paypal/PayPal-node-SDK",
"https://github.com/singular-labs/web-sdk",
"https://github.com/serverless/compose",
"https://github.com/jonschlinkert/base-config-process",
"https://github.com/jonschlinkert/base-cli-schema",
"https://github.com/jhermsmeier/node-vcf",
"https://github.com/jonschlinkert/base-namespace",
"https://github.com/holistics/dbdocs",
"https://github.com/mikevercoelen/gulp-sass-glob",
'https://github.com/serverless/test',
"https://github.com/bower/registry-client",
"https://github.com/pa11y-reporter-html/pa11y-reporter-html",
"https://github.com/woocommerce/woocommerce-rest-api-js",
"https://github.com/cssinjs/jss-isolate",
"https://github.com/import-io/s3-deploy",
"https://github.com/thenativeweb/eslint-plugin-extended",
"https://github.com/merklejerk/bn-str",
"https://github.com/yahoo/gear",
"https://github.com/yahoo/gear-lib",
"https://github.com/nytimes/pretty-lights",
"https://github.com/mui-org/react-transition-group",
"https://github.com/logicalparadox/jq",
"https://github.com/wistia/eslint",
"https://github.wdf.sap.corp/xs2/node-hdbext.git",
"https://hg.mozilla.org/mozilla-central/",
"https://github.com/MikeKovarik/gulp-better-rollup",
"https://github.com/kellyselden/git-diff-apply",
"https://github.com/datocms/js-datocms-client"
];
const FILTER_LIST_REGEX = FILTER_LIST.map(GlobToRegExp)

View File

@@ -30,15 +30,15 @@ const intermediateRepoList = await cacheFunctionOutput('repos.n2.json', async fu
// const packageMap = new Map(packageList)
console.log(`Total repos`,intermediateRepoList.length)
const intermediateRepoListSmaller = intermediateRepoList.slice(0,20000);
const intermediateRepoListSmaller = intermediateRepoList.slice(0,60000);
const repoStatus = await processPromisesBatch(intermediateRepoListSmaller,40,cloneRepoAndCheck)
const repoStatusString = csv.stringify(repoStatus);
await fsp.writeFile('repostatus.csv', repoStatusString);
await fsp.writeFile('repostatus2.csv', repoStatusString);
const minableRepositories = repoStatus.filter(e=>(e!==null && e?.[1]));
const output = csv.stringify(minableRepositories);
await fsp.writeFile('minableRepositories2.csv', output);
await fsp.writeFile('minableRepositories3.csv', output);
// console.log("written results")

View File

@@ -15,7 +15,7 @@ export async function cloneRepoAndCheck([repoName, repoGitUrl, downloadCount]) {
const repoPath = resolve('../cache-repos/repos', repoName)
if (filterRepo(repoGitUrl)) {
console.log("[git] ignoring ", repoName)
// console.log("[git] ignoring ", repoName)
return [repoName, null]
};
// console.log('[git] fetching',repoName, repoGitUrl);
@@ -24,9 +24,12 @@ export async function cloneRepoAndCheck([repoName, repoGitUrl, downloadCount]) {
let packageJSONContentsString = null;
try{
packageJSONContentsString = await cacheFunctionOutput(`cache-repo-package-json-${repoName.replaceAll('/',"_sl_")}.json`,async ()=> JSON.stringify(await repo.package()),true);
packageJSONContentsString = await cacheFunctionOutput(`cache-repo-package-json-${repoName.replaceAll('/',"_sl_")}.json`,async ()=>{
// console.log(`[npm] fetching package.json for ${repoName} from npm`);
const packageJson = await repo.package();
return JSON.stringify(packageJson);
},true);
// console.log("[git] fetched package.json for", repoName);
}catch(e){
throw new Error(`Failed to fetch package.json for ${repoName} from npm: ${e.message}`);
@@ -48,7 +51,7 @@ export async function cloneRepoAndCheck([repoName, repoGitUrl, downloadCount]) {
return [repoName, null];
}
if(isLikelyTypescriptProject(packageJSONContents)) {
if(isUnwantedProject(packageJSONContents)) {
await removeUnnecessaryClone(repoPath);
// console.warn("[git] Ignoring ", repoName, "because it is a typescript project.");
// console.log("Cleaned up ", repoPath);
@@ -57,7 +60,11 @@ export async function cloneRepoAndCheck([repoName, repoGitUrl, downloadCount]) {
const hasDependencies = checkTestingDependencies(packageJSONContents, repoName);
if (hasDependencies) {
await cacheCloneIdempotently(repoPath, repoName, repoGitUrl);
const gotCloned = await cacheCloneIdempotently(repoPath, repoName, repoGitUrl);
if (!gotCloned) {
console.warn("[git] Failed to clone ", repoName, "at", repoGitUrl);
return [repoName, null];
}
const tsConfigFileLocation = resolve(repoPath, 'tsconfig.json');
const tsConfigFileExists = existsSync(tsConfigFileLocation);
@@ -79,13 +86,21 @@ export async function cloneRepoAndCheck([repoName, repoGitUrl, downloadCount]) {
return [repoName, null]
}
}
/**
 * Predicate over a dependency name: reports whether the name begins with
 * one of the prefixes this module filters on.
 * @param {string} packageName - dependency name to test
 * @returns {boolean} true when the name starts with `typescript` or `node-gyp`
 */
function filterPackage(packageName){
    const filteredPrefixes = ['typescript', 'node-gyp'];
    for (const prefix of filteredPrefixes) {
        if (packageName.startsWith(prefix)) {
            return true;
        }
    }
    return false;
}
function isLikelyTypescriptProject(packageJSONContents) {
function isUnwantedProject(packageJSONContents) {
// Is typescript project?
if (packageJSONContents.devDependencies !== undefined) {
if (Object.keys(packageJSONContents.devDependencies).some(e => e.startsWith('typescript'))) {
if (Object.keys(packageJSONContents.devDependencies).some(filterPackage)) {
return true;
}
if (Object.keys(packageJSONContents.dependencies).some(e => e.startsWith('typescript'))) {
if (packageJSONContents.dependencies !== undefined && Object.keys(packageJSONContents.dependencies).some(filterPackage)) {
return true;
}
}
@@ -137,13 +152,18 @@ function checkTestingDependencies(packageJSONContents, repoName) {
/**
 * Ensure a shallow clone of a repository exists at `repoPath`.
 *
 * If `repoPath` already exists and is a directory it is reused as-is and no
 * clone is attempted. Otherwise a single-branch, depth-1 clone is made.
 * Clone failures are logged and reported through the return value instead
 * of being thrown, so the caller can skip the repository.
 *
 * @param {string} repoPath - Local path the clone should live at.
 * @param {string} repoName - Repository name, used in error messages.
 * @param {string} repoGitUrl - URL to clone from.
 * @returns {Promise<boolean>} true if the path already existed as a
 *   directory or the clone succeeded; false if the clone failed.
 * @throws {Error} If `repoPath` exists but is not a directory.
 */
async function cacheCloneIdempotently(repoPath, repoName, repoGitUrl) {
    if (existsSync(repoPath)) {
        const isDir = (await lstat(repoPath)).isDirectory();
        if (!isDir) {
            // Error() only uses its first argument as the message, so the
            // whole text has to be built into one string.
            throw new Error(`${repoName} is mangled. delete directory and re-clone.`);
        }
        return true;
    }

    try {
        await git.clone(repoGitUrl, repoPath, { 'single-branch': true, depth: 1 });
        return true;
    } catch (e) {
        // Deliberately best-effort: log and let the caller decide what to do.
        console.log(e.message);
        return false;
    }
}