This commit is contained in:
2025-07-26 13:44:32 +01:00
parent 5b584a90a5
commit 2d02acacc7
24 changed files with 4519 additions and 77 deletions

View File

@@ -0,0 +1,35 @@
/**
 * Repository URLs that the mining step skips without cloning.
 *
 * `cloneRepoAndCheck` (mine.mjs) tests membership with `Array.prototype.includes`
 * before it clones anything, so an entry only takes effect when it is exactly
 * equal to the repository URL recorded for a package.
 * NOTE(review): presumably these are repositories that fail to clone or are
 * otherwise unusable for mining — confirm and record the reason per entry.
 * @type {string[]}
 */
export const FILTER_LIST = [
"https://gitlab.com/contexttesting/zoroaster.git",
"https://github.com/Eternity-Bots",
"https://github.com/node-x-extras/x-path",
"https://github.com/substack/node-x256",
"https://github.com/substack/node-wordwrap",
"https://github.com/zkochan/packages/blob/main/which-pm-runs",
"https://github.com/webpack-contrib/webpack-addons",
"https://github.com/zznoillusion1026/MyImage",
"https://codehub.devcloud.huaweicloud.com/jsztxm00001/zzb-vue-ui.git",
"https://github.com/DZSF",
"https://github.com/chuzhixin/zx-count",
"https://github.com/nodelib/nodelib/tree/master/packages/fs/fs.stat",
"https://github.com/nodelib/nodelib/tree/master/packages/fs/fs.scandir",
"https://github.com/nodelib/nodelib/tree/master/packages/fs/fs.walk",
"https://github.com/nodelib/nodelib/tree/master/packages/fs/fs.macchiato",
"https://github.com/substack/text-table",
"https://github.com/babel/babel/tree/master/packages/babel-plugin-syntax-object-rest-spread",
"https://github.com/babel/babel/tree/master/packages/babel-plugin-syntax-optional-catch-binding",
"https://github.com/babel/babel/tree/master/packages/babel-plugin-syntax-async-generators",
"https://github.com/babel/babel/tree/master/packages/babel-plugin-syntax-optional-chaining",
"https://github.com/babel/babel/tree/master/packages/babel-plugin-syntax-json-strings",
"https://github.com/babel/babel/tree/master/packages/babel-plugin-syntax-nullish-coalescing-operator",
"https://github.com/babel/babel/tree/master/packages/babel-plugin-syntax-bigint",
"https://github.com/babel/babel/tree/master/packages/babel-plugin-syntax-dynamic-import",
"https://github.com/substack/node-commondir",
"https://github.com/babel/babel/tree/master/packages/babel-plugin-syntax-export-namespace-from",
"https://github.com/substack/https-browserify",
"https://github.com/babel/babel/tree/master/packages/babel-runtime",
"https://github.com/paulmillr/async-each",
"https://github.com/yarnpkg/yarn/blob/master/packages",
"https://github.com/substack/semver-compare",
"https://github.com/substack/node-archy"
];

46
src_dataset/batch.mjs Normal file
View File

@@ -0,0 +1,46 @@
import { writeFile,open } from "node:fs/promises";
/**
 * Runs `asyncCallback` over `items` in consecutive batches of at most `limit`
 * items. Every item of a batch is started together and the next batch only
 * begins once the whole current batch has settled.
 *
 * After each batch its results are written to `cache/progress.txt` (one JSON
 * document per line, via `transformRes`). The file is opened with "w+", so any
 * previous content is discarded when the run starts. The `cache` directory
 * must already exist.
 *
 * @template T
 * @template U
 * @param {T[]} items values to process, in order
 * @param {number} limit maximum number of items processed concurrently; must be > 0
 * @param {(T)=>Promise<U>} asyncCallback invoked through `Array.prototype.map`,
 *   so it also receives the index within the batch and the batch array
 * @returns {Promise<U[]>} results in the same order as `items`
 * @throws {RangeError} when `limit` is not a number greater than zero
 */
export async function processPromisesBatch(
  items,
  limit,
  asyncCallback,
) {
  // A zero or negative step would never advance `start`, and NaN would
  // silently skip every item, so reject those before touching the file.
  if (!(limit > 0)) {
    throw new RangeError(`[batch] limit must be a number greater than 0, received ${String(limit)}`);
  }
  const results = [];
  const fileHandle = await open('cache/progress.txt', "w+");
  try {
    for (let start = 0; start < items.length; start += limit) {
      const end = Math.min(start + limit, items.length);
      const slicedResults = await Promise.all(items.slice(start, end).map(asyncCallback));
      // Start the write first so it overlaps with the bookkeeping below.
      const writePromise = writeFile(fileHandle, transformRes(slicedResults), { flush: true });
      console.log(`[batch] finished batch [${start},${end})`)
      results.push(...slicedResults);
      await writePromise;
    }
  } finally {
    // Release the descriptor even when a callback or a write rejects.
    await fileHandle.close();
  }
  return results;
}
/**
 * Serialises every result with `JSON.stringify` and terminates each one with
 * a newline, producing newline-delimited output.
 *
 * @template T
 * @param {Array<T>} results values to serialise, in order
 * @returns {string} the concatenated lines; the empty string for no results
 */
function transformRes(results) {
  // Array.from walks the iterator, so sparse slots are visited like any other entry.
  const lines = Array.from(results, (entry) => `${JSON.stringify(entry)}\n`);
  return lines.join("");
}

26
src_dataset/cache.mjs Normal file
View File

@@ -0,0 +1,26 @@
import { existsSync } from "node:fs";
import { mkdir, readFile, writeFile } from "node:fs/promises";
import { dirname, resolve } from "node:path";
/**
 * Returns the JSON stored at `./cache/<fileName>` when that file exists;
 * otherwise runs `asyncCallback`, stores its result there as JSON and returns it.
 *
 * On a cache hit the value is whatever `JSON.parse` yields, so the callback's
 * result should survive a JSON round trip. On a miss the callback's own
 * return value is handed back unchanged.
 *
 * @template T
 * @param {string} fileName path of the cache entry, resolved against `./cache`
 * @param {()=>Promise<T>} asyncCallback producer invoked only on a cache miss
 * @returns {Promise<T>}
 */
export async function cacheFunctionOutput(fileName, asyncCallback) {
  const fileLoc = resolve('./cache', fileName);
  if (existsSync(fileLoc)) {
    console.log("[cacher] Using cached ", fileLoc);
    const cachedContents = (await readFile(fileLoc)).toString();
    return JSON.parse(cachedContents);
  }
  console.log("[cacher] cache miss")
  const returnRes = await asyncCallback();
  const fileContents = JSON.stringify(returnRes);
  // Create the target folder first: without it the write fails with ENOENT
  // and the (possibly expensive) result just computed would be thrown away.
  await mkdir(dirname(fileLoc), { recursive: true });
  await writeFile(fileLoc, fileContents);
  console.log("[cacher] saved ", fileLoc)
  return returnRes;
}

View File

@@ -1,10 +1,44 @@
// NOTE(review): this span declares `output` twice and binds `packages` both as a
// static import and inside the cached callback, which looks like two revisions of
// the script interleaved — confirm which statements are live before relying on it.
import packages from 'download-counts' assert { type: 'json'}
import * as csv from 'csv'
import fsp from 'fs/promises'
// [name, count] pairs whose count exceeds 500000, highest count first.
const packageList = Object.keys(packages).map(e => [e, packages[e]]).filter(e=>e[1]>500000).sort((e,f)=>(f[1]-e[1]));
const packageMap = new Map(packageList)
import { cloneRepoAndCheck } from './mine.mjs';
import { cacheFunctionOutput } from './cache.mjs';
import { processPromisesBatch } from './batch.mjs';
console.log(packageMap.size)
const output = csv.stringify(packageList)
await fsp.writeFile('output.csv',output);
// Build the candidate list once; later runs reuse the copy cached as 'repos.json'.
const intermediateRepoList = await cacheFunctionOutput('repos.json', async function () {
// Load both JSON data sets concurrently.
const [packagesM, packageReposM] = await Promise.all([
import('download-counts', { with:{type: 'json'}}),
import('all-the-package-repos', { with: { type: 'json' } })
]);
const packages = packagesM.default;
const packageRepos = packageReposM.default;
// Keep names whose count exceeds 100 and that are not under the "@types/" scope.
const packageList = Object.keys(packages).map(e => [e, packages[e]])
.filter(e => e[1] > 100).filter(e => !e[0].startsWith("@types/"))
console.log('packagelist', packageList.length)
/**
* Triples of [package name, entry from all-the-package-repos, value from download-counts].
* @type {[string,string,number][]}
*/
const withRepos = packageList.map(e => [e[0], packageRepos[e[0]], e[1]])
console.log('withrepos', withRepos.length);
// Drop packages whose repository entry is null, undefined or the empty string.
const withExactRepos = withRepos.filter(e => ((e[1]) !== null && (e[1]) !== undefined && (e[1]) !== ""))
console.log('withreposCleaned', withExactRepos.length);
// Sort in place by the third element, largest first.
withExactRepos.sort((a,b)=>(-a[2]+b[2]))
return withExactRepos;
})
// const packageMap = new Map(packageList)
console.log(intermediateRepoList.length)
// Only the first 2000 entries are processed, 15 at a time.
const intermediateRepoListSmaller = intermediateRepoList.slice(0,2000);
const repoStatus = await processPromisesBatch(intermediateRepoListSmaller,15,cloneRepoAndCheck)
// Persist every result, then separately those whose second element is truthy.
const repoStatusString = csv.stringify(repoStatus);
await fsp.writeFile('repostatus.csv', repoStatusString);
const minableRepositories = repoStatus.filter(e=>(e!==null && e?.[1]));
const output = csv.stringify(minableRepositories);
await fsp.writeFile('minableRepositories2.csv', output);
// console.log("written results")

76
src_dataset/mine.mjs Normal file
View File

@@ -0,0 +1,76 @@
import { existsSync, } from 'fs'
import { lstat, readFile } from 'fs/promises'
import git from 'git-client'
import { resolve } from 'path'
import int from 'set.prototype.intersection';
import { FILTER_LIST } from './FILTER_LIST.mjs';
/**
 * Makes sure a package's repository is available under `cache/repos/<repoName>`
 * and reports its `scripts.test` entry when the repository qualifies.
 *
 * The second element of the result is `null` when any of these holds:
 * the URL is listed in `FILTER_LIST` (nothing is cloned in that case), the
 * repository root contains a `tsconfig.json`, it has no `package.json`, the
 * `package.json` is not a parsable JSON object, none of the libraries known to
 * `checkTestingDependencies` is declared, or no `scripts.test` entry exists.
 *
 * @param {[string,string,number]} param0 [package name, repository URL, download count];
 *   the count is accepted but not used here
 * @returns {Promise<[string,string|null]>} the package name paired with its
 *   `scripts.test` value, or with `null`
 */
export async function cloneRepoAndCheck([repoName, repoGitUrl, downloadCount]) {
  const repoPath = resolve('cache/repos', repoName)
  if (FILTER_LIST.includes(repoGitUrl)) {
    console.log("[git] ignoring ", repoName)
    return [repoName, null]
  }
  await cacheCloneIdempotently(repoPath, repoName, repoGitUrl)
  const tsConfigFileLocation = resolve(repoPath, 'tsconfig.json');
  if (existsSync(tsConfigFileLocation)) return [repoName, null];
  const packageFile = resolve(repoPath, 'package.json')
  if (!existsSync(packageFile)) return [repoName, null];
  const packageJSONContentsString = (await readFile(packageFile)).toString()
  let packageJSONContents;
  try {
    packageJSONContents = JSON.parse(packageJSONContentsString)
  } catch (err) {
    if (!(err instanceof SyntaxError)) throw err;
    // One malformed manifest must not reject the whole batch it belongs to.
    console.warn("[git] unparsable package.json in ", repoName, err.message)
    return [repoName, null]
  }
  // Valid JSON that is not an object (e.g. `null`) carries no manifest data.
  if (packageJSONContents === null || typeof packageJSONContents !== 'object') {
    return [repoName, null]
  }
  if (!checkTestingDependencies(packageJSONContents, repoName)) return [repoName, null]
  // Normalise a missing test script to null so the result matches the declared type.
  return [repoName, packageJSONContents.scripts?.test ?? null]
}
/**
 * Reports whether a parsed package.json declares at least one of the testing
 * libraries jest, mocha, chai, istanbul or vitest under `dependencies` or
 * `devDependencies`.
 *
 * @param {object|null|undefined} packageJSONContents parsed package.json; a
 *   missing manifest or a missing/null dependency section counts as "nothing declared"
 * @param {string} repoName accepted for call-site compatibility; not used
 * @returns {boolean} true when any listed library is declared
 */
function checkTestingDependencies(packageJSONContents, repoName) {
  const testingLibraries = new Set(['jest', 'mocha', 'chai', 'istanbul', 'vitest']);
  // `?? {}` also covers an explicit `null` section, which Object.keys would reject.
  const declaredNames = [
    ...Object.keys(packageJSONContents?.dependencies ?? {}),
    ...Object.keys(packageJSONContents?.devDependencies ?? {}),
  ];
  return declaredNames.some((dep) => testingLibraries.has(dep));
}
/**
 * Clones `repoGitUrl` into `repoPath` unless something already exists there.
 *
 * An existing directory is taken as a finished clone and left untouched; its
 * contents are not validated. Anything else at that path is treated as a
 * broken cache entry.
 *
 * @param {string} repoPath location of the local clone
 * @param {string} repoName package name, used in the error message
 * @param {string} repoGitUrl URL handed to `git.clone`
 * @returns {Promise<void>}
 * @throws {Error} when `repoPath` exists but is not a directory
 */
async function cacheCloneIdempotently(repoPath, repoName, repoGitUrl) {
  if (!existsSync(repoPath)) {
    console.log("[git] cloning", repoGitUrl);
    // NOTE(review): presumably these options map to `--single-branch --depth 1` — confirm against git-client.
    await git.clone(repoGitUrl, repoPath, { 'single-branch': true, depth: 1 })
    return;
  }
  const isDir = (await lstat(repoPath)).isDirectory()
  if (!isDir) {
    // Error's second argument is an options bag, so the text must be a single string.
    throw new Error(`${repoName} is mangled. delete directory and re-clone.`)
  }
}