diff --git a/src_dataset/FILTER_LIST.mjs b/src_dataset/FILTER_LIST.mjs
index 6323f1c..788ccab 100644
--- a/src_dataset/FILTER_LIST.mjs
+++ b/src_dataset/FILTER_LIST.mjs
@@ -58,7 +58,51 @@ const FILTER_LIST = [
   "https://github.com/spenceralger/rcfinder",
   "https://github.com/okta/okta-idx-js",
   "https://github.com/cssinjs/jss-compose",
-  "https://github.com/kaazing/node-http2"
+  "https://github.com/kaazing/node-http2",
+  "https://github.com/Financial-Times/polyfill-useragent-normaliser",
+  "https://github.com/cssinjs/jss-extend",
+  "https://github.com/cssinjs/jss-template",
+  "https://github.com/serverless/enterprise-plugin",
+  "https://github.com/financial-times/polyfill-library",
+  "https://github.com/trufflesuite/truffle/tree/master/packages/*",
+  "https://github.com/FlatFilers/platform-sdk-mono",
+  "https://github.com/rkusa/linebreaker",
+  "https://github.com/shahata/cdnjs-cdn-data",
+  "https://github.com/openapi-library/OpenAPIValidators/tree/master/packages/*",
+  "https://github.com/paypal/Checkout-NodeJS-SDK",
+  "https://github.com/chilijung/gulp-cssmin",
+  "https://github.com/shahata/jsdelivr-cdn-data",
+  "https://github.com/voltidev/broccoli-svg-optimizer",
+  "https://github.com/jonschlinkert/is-registered",
+  "https://github.com/helpers/helper-cache",
+  "https://github.com/paypal/PayPal-node-SDK",
+  "https://github.com/singular-labs/web-sdk",
+  "https://github.com/serverless/compose",
+  "https://github.com/jonschlinkert/base-config-process",
+  "https://github.com/jonschlinkert/base-cli-schema",
+  "https://github.com/jhermsmeier/node-vcf",
+  "https://github.com/jonschlinkert/base-namespace",
+  "https://github.com/holistics/dbdocs",
+  "https://github.com/mikevercoelen/gulp-sass-glob",
+  "https://github.com/serverless/test",
+  "https://github.com/bower/registry-client",
+  "https://github.com/pa11y-reporter-html/pa11y-reporter-html",
+  "https://github.com/woocommerce/woocommerce-rest-api-js",
+  "https://github.com/cssinjs/jss-isolate",
+  "https://github.com/import-io/s3-deploy",
+  "https://github.com/thenativeweb/eslint-plugin-extended",
+  "https://github.com/merklejerk/bn-str",
+  "https://github.com/yahoo/gear",
+  "https://github.com/yahoo/gear-lib",
+  "https://github.com/nytimes/pretty-lights",
+  "https://github.com/mui-org/react-transition-group",
+  "https://github.com/logicalparadox/jq",
+  "https://github.com/wistia/eslint",
+  "https://github.wdf.sap.corp/xs2/node-hdbext.git",
+  "https://hg.mozilla.org/mozilla-central/",
+  "https://github.com/MikeKovarik/gulp-better-rollup",
+  "https://github.com/kellyselden/git-diff-apply",
+  "https://github.com/datocms/js-datocms-client"
 ];
 
 const FILTER_LIST_REGEX = FILTER_LIST.map(GlobToRegExp)
diff --git a/src_dataset/index.mjs b/src_dataset/index.mjs
index b34e65a..b387a93 100644
--- a/src_dataset/index.mjs
+++ b/src_dataset/index.mjs
@@ -30,15 +30,15 @@ const intermediateRepoList = await cacheFunctionOutput('repos.n2.json', async fu
 // const packageMap = new Map(packageList)
 
 console.log(`Total repos`,intermediateRepoList.length)
-const intermediateRepoListSmaller = intermediateRepoList.slice(0,20000);
+const intermediateRepoListSmaller = intermediateRepoList.slice(0,60000);
 const repoStatus = await processPromisesBatch(intermediateRepoListSmaller,40,cloneRepoAndCheck)
 
 const repoStatusString = csv.stringify(repoStatus);
-await fsp.writeFile('repostatus.csv', repoStatusString);
+await fsp.writeFile('repostatus2.csv', repoStatusString);
 
 const minableRepositories = repoStatus.filter(e=>(e!==null && e?.[1]));
 const output = csv.stringify(minableRepositories);
-await fsp.writeFile('minableRepositories2.csv', output);
+await fsp.writeFile('minableRepositories3.csv', output);
 
 
 // console.log("written results")
diff --git a/src_dataset/mine.mjs b/src_dataset/mine.mjs
index fa003a0..132da49 100644
--- a/src_dataset/mine.mjs
+++ b/src_dataset/mine.mjs
@@ -15,7 +15,7 @@ export async function cloneRepoAndCheck([repoName, repoGitUrl, downloadCount]) {
   const repoPath = resolve('../cache-repos/repos', repoName)
 
   if (filterRepo(repoGitUrl)) {
-    console.log("[git] ignoring ", repoName)
+    // console.log("[git] ignoring ", repoName)
     return [repoName, null]
   };
   // console.log('[git] fetching',repoName, repoGitUrl);
@@ -24,9 +24,12 @@ export async function cloneRepoAndCheck([repoName, repoGitUrl, downloadCount]) {
 
 
   let packageJSONContentsString = null;
-
   try{
-    packageJSONContentsString = await cacheFunctionOutput(`cache-repo-package-json-${repoName.replaceAll('/',"_sl_")}.json`,async ()=> JSON.stringify(await repo.package()),true);
+    packageJSONContentsString = await cacheFunctionOutput(`cache-repo-package-json-${repoName.replaceAll('/',"_sl_")}.json`,async ()=>{
+      // console.log(`[npm] fetching package.json for ${repoName} from npm`);
+      const packageJson = await repo.package();
+      return JSON.stringify(packageJson);
+    },true);
     // console.log("[git] fetched package.json for", repoName);
   }catch(e){
     throw new Error(`Failed to fetch package.json for ${repoName} from npm: ${e.message}`);
@@ -48,7 +51,7 @@ export async function cloneRepoAndCheck([repoName, repoGitUrl, downloadCount]) {
     return [repoName, null];
   }
 
-  if(isLikelyTypescriptProject(packageJSONContents)) {
+  if(isUnwantedProject(packageJSONContents)) {
     await removeUnnecessaryClone(repoPath);
     // console.warn("[git] Ignoring ", repoName, "because it is a typescript project.");
     // console.log("Cleaned up ", repoPath);
@@ -57,7 +60,11 @@ export async function cloneRepoAndCheck([repoName, repoGitUrl, downloadCount]) {
 
   const hasDependencies = checkTestingDependencies(packageJSONContents, repoName);
   if (hasDependencies) {
-    await cacheCloneIdempotently(repoPath, repoName, repoGitUrl);
+    const gotCloned = await cacheCloneIdempotently(repoPath, repoName, repoGitUrl);
+    if (!gotCloned) {
+      console.warn("[git] Failed to clone ", repoName, "at", repoGitUrl);
+      return [repoName, null];
+    }
 
     const tsConfigFileLocation = resolve(repoPath, 'tsconfig.json');
     const tsConfigFileExists = existsSync(tsConfigFileLocation);
@@ -79,13 +86,21 @@ export async function cloneRepoAndCheck([repoName, repoGitUrl, downloadCount]) {
     return [repoName, null]
   }
 }
+/**
+ * Filter by packages
+ * @param {string} packageName
+ */
+function filterPackage(packageName){
+  return packageName.startsWith('typescript') || packageName.startsWith('node-gyp')
+}
 
-function isLikelyTypescriptProject(packageJSONContents) {
+function isUnwantedProject(packageJSONContents) {
+  // Is typescript project?
   if (packageJSONContents.devDependencies !== undefined) {
-    if (Object.keys(packageJSONContents.devDependencies).some(e => e.startsWith('typescript'))) {
+    if (Object.keys(packageJSONContents.devDependencies).some(filterPackage)) {
       return true;
     }
-    if (Object.keys(packageJSONContents.dependencies).some(e => e.startsWith('typescript'))) {
+    if (packageJSONContents.dependencies !== undefined && Object.keys(packageJSONContents.dependencies).some(filterPackage)) {
       return true;
     }
   }
@@ -137,13 +152,18 @@ function checkTestingDependencies(packageJSONContents, repoName) {
 async function cacheCloneIdempotently(repoPath, repoName, repoGitUrl) {
   if (existsSync(repoPath)) {
     const isDir = (await lstat(repoPath)).isDirectory()
-    if (!isDir) throw new Error(repoName, " is mangled. delete directory and re-clone.")
-    else {
-      // const path = await git.status({ $cwd: repoPath })
-      // console.log("[git] already cloned", repoName, "at", repoPath);
+    if (!isDir) {
+      throw new Error(`${repoName} is mangled. delete directory and re-clone.`)
     }
+    return true;
   } else {
-    console.log("[git] cloning", repoGitUrl);
-    await git.clone(repoGitUrl, repoPath, { 'single-branch': true, depth: 1 })
+    // console.log("[git] cloning", repoGitUrl);
+    try{
+      await git.clone(repoGitUrl, repoPath, { 'single-branch': true, depth: 1 })
+      return true;
+    }catch(e){
+      console.log(e.message)
+      return false;
+    }
   }
 }