[update]
This commit is contained in:
35
src_dataset/FILTER_LIST.mjs
Normal file
35
src_dataset/FILTER_LIST.mjs
Normal file
@@ -0,0 +1,35 @@
|
||||
/**
 * Repositories excluded from mining, matched by exact git/repo URL
 * (see cloneRepoAndCheck in mine.mjs).
 */
export const FILTER_LIST = [
  "https://gitlab.com/contexttesting/zoroaster.git",
  "https://github.com/Eternity-Bots",
  "https://github.com/node-x-extras/x-path",
  "https://github.com/substack/node-x256",
  "https://github.com/substack/node-wordwrap",

  "https://github.com/zkochan/packages/blob/main/which-pm-runs",
  "https://github.com/webpack-contrib/webpack-addons",
  "https://github.com/zznoillusion1026/MyImage",
  "https://codehub.devcloud.huaweicloud.com/jsztxm00001/zzb-vue-ui.git",
  "https://github.com/DZSF",
  "https://github.com/chuzhixin/zx-count",
  "https://github.com/nodelib/nodelib/tree/master/packages/fs/fs.stat",
  "https://github.com/nodelib/nodelib/tree/master/packages/fs/fs.scandir",
  "https://github.com/nodelib/nodelib/tree/master/packages/fs/fs.walk",
  "https://github.com/nodelib/nodelib/tree/master/packages/fs/fs.macchiato",
  "https://github.com/substack/text-table",
  "https://github.com/babel/babel/tree/master/packages/babel-plugin-syntax-object-rest-spread",
  "https://github.com/babel/babel/tree/master/packages/babel-plugin-syntax-optional-catch-binding",
  "https://github.com/babel/babel/tree/master/packages/babel-plugin-syntax-async-generators",
  "https://github.com/babel/babel/tree/master/packages/babel-plugin-syntax-optional-chaining",
  "https://github.com/babel/babel/tree/master/packages/babel-plugin-syntax-json-strings",
  "https://github.com/babel/babel/tree/master/packages/babel-plugin-syntax-nullish-coalescing-operator",
  "https://github.com/babel/babel/tree/master/packages/babel-plugin-syntax-bigint",
  "https://github.com/babel/babel/tree/master/packages/babel-plugin-syntax-dynamic-import",
  "https://github.com/substack/node-commondir",
  "https://github.com/babel/babel/tree/master/packages/babel-plugin-syntax-export-namespace-from",
  "https://github.com/substack/https-browserify",
  "https://github.com/babel/babel/tree/master/packages/babel-runtime",
  "https://github.com/paulmillr/async-each",
  "https://github.com/yarnpkg/yarn/blob/master/packages",
  "https://github.com/substack/semver-compare",
  "https://github.com/substack/node-archy",
];
|
46
src_dataset/batch.mjs
Normal file
46
src_dataset/batch.mjs
Normal file
@@ -0,0 +1,46 @@
|
||||
import { writeFile,open } from "node:fs/promises";
|
||||
|
||||
/**
 * Run `asyncCallback` over `items` in sequential batches of at most `limit`
 * concurrent calls, logging each finished batch and appending the batch's
 * results (one JSON line per item) to cache/progress.txt.
 *
 * @template T
 * @template U
 * @param {T[]} items
 * @param {number} limit - maximum number of concurrent callback invocations
 * @param {(item: T) => Promise<U>} asyncCallback
 * @returns {Promise<U[]>} results in the same order as `items`
 */
export async function processPromisesBatch(
  items,
  limit,
  asyncCallback,
) {
  const results = [];
  const fileHandle = await open('cache/progress.txt', "w+");
  try {
    for (let start = 0; start < items.length; start += limit) {
      const end = Math.min(start + limit, items.length);

      // Process one batch concurrently; Promise.all preserves input order.
      const slicedResults = await Promise.all(items.slice(start, end).map(asyncCallback));

      // Kick off the progress write, do bookkeeping, then await it before
      // starting the next batch so writes never interleave.
      const writePromise = writeFile(fileHandle, transformRes(slicedResults), { flush: true });
      console.log(`[batch] finished batch [${start},${end})`)
      results.push(...slicedResults);
      await writePromise;
    }
  } finally {
    // FIX: close() was previously not awaited and was skipped entirely when a
    // batch rejected, leaking the file handle.
    await fileHandle.close();
  }

  return results;
}


/**
 * Serialize results as newline-delimited JSON, one line per entry.
 * @template T
 * @param {Array<T>} results
 * @returns {string}
 */
function transformRes(results) {
  let str = ""
  for (const x of results) {
    str += JSON.stringify(x) + '\n';
  }
  return str;
}
|
26
src_dataset/cache.mjs
Normal file
26
src_dataset/cache.mjs
Normal file
@@ -0,0 +1,26 @@
|
||||
import { existsSync } from "node:fs";
|
||||
import { readFile, writeFile } from "node:fs/promises";
|
||||
import { resolve } from "node:path";
|
||||
|
||||
/**
 * Memoize an async function's JSON-serializable result on disk under
 * ./cache/<fileName>. A cache hit is parsed and returned; a miss runs the
 * callback, persists its result, then returns it.
 *
 * @template T
 * @param {string} fileName
 * @param {()=>Promise<T>} asyncCallback
 * @returns {Promise<T>}
 */
export async function cacheFunctionOutput(fileName, asyncCallback) {
  const cachePath = resolve('./cache', fileName);

  if (!existsSync(cachePath)) {
    console.log("[cacher] cache miss")
    const freshResult = await asyncCallback();
    await writeFile(cachePath, JSON.stringify(freshResult));
    console.log("[cacher] saved ", cachePath)
    return freshResult;
  }

  console.log("[cacher] Using cached ", cachePath);
  const cachedText = (await readFile(cachePath)).toString();
  return JSON.parse(cachedText);
}
|
@@ -1,10 +1,44 @@
|
||||
import packages from 'download-counts' assert { type: 'json'}
|
||||
import * as csv from 'csv'
|
||||
import fsp from 'fs/promises'
|
||||
const packageList = Object.keys(packages).map(e => [e, packages[e]]).filter(e=>e[1]>500000).sort((e,f)=>(f[1]-e[1]));
|
||||
const packageMap = new Map(packageList)
|
||||
import { cloneRepoAndCheck } from './mine.mjs';
|
||||
import { cacheFunctionOutput } from './cache.mjs';
|
||||
import { processPromisesBatch } from './batch.mjs';
|
||||
|
||||
console.log(packageMap.size)
|
||||
|
||||
const output = csv.stringify(packageList)
|
||||
await fsp.writeFile('output.csv',output);
|
||||
|
||||
const intermediateRepoList = await cacheFunctionOutput('repos.json', async function () {
|
||||
const [packagesM, packageReposM] = await Promise.all([
|
||||
import('download-counts', { with:{type: 'json'}}),
|
||||
import('all-the-package-repos', { with: { type: 'json' } })
|
||||
]);
|
||||
const packages = packagesM.default;
|
||||
const packageRepos = packageReposM.default;
|
||||
|
||||
const packageList = Object.keys(packages).map(e => [e, packages[e]])
|
||||
.filter(e => e[1] > 100).filter(e => !e[0].startsWith("@types/"))
|
||||
console.log('packagelist', packageList.length)
|
||||
/**
|
||||
* @type {[string,string,number][]} repo, link count
|
||||
*/
|
||||
const withRepos = packageList.map(e => [e[0], packageRepos[e[0]], e[1]])
|
||||
console.log('withrepos', withRepos.length);
|
||||
const withExactRepos = withRepos.filter(e => ((e[1]) !== null && (e[1]) !== undefined && (e[1]) !== ""))
|
||||
console.log('withreposCleaned', withExactRepos.length);
|
||||
withExactRepos.sort((a,b)=>(-a[2]+b[2]))
|
||||
return withExactRepos;
|
||||
})
|
||||
// const packageMap = new Map(packageList)
|
||||
|
||||
console.log(intermediateRepoList.length)
|
||||
const intermediateRepoListSmaller = intermediateRepoList.slice(0,2000);
|
||||
|
||||
const repoStatus = await processPromisesBatch(intermediateRepoListSmaller,15,cloneRepoAndCheck)
|
||||
|
||||
const repoStatusString = csv.stringify(repoStatus);
|
||||
await fsp.writeFile('repostatus.csv', repoStatusString);
|
||||
|
||||
const minableRepositories = repoStatus.filter(e=>(e!==null && e?.[1]));
|
||||
const output = csv.stringify(minableRepositories);
|
||||
await fsp.writeFile('minableRepositories2.csv', output);
|
||||
// console.log("written results")
|
||||
|
||||
|
76
src_dataset/mine.mjs
Normal file
76
src_dataset/mine.mjs
Normal file
@@ -0,0 +1,76 @@
|
||||
import { existsSync, } from 'fs'
|
||||
import { lstat, readFile } from 'fs/promises'
|
||||
import git from 'git-client'
|
||||
import { resolve } from 'path'
|
||||
import int from 'set.prototype.intersection';
|
||||
import { FILTER_LIST } from './FILTER_LIST.mjs';
|
||||
/**
 * Clone (or reuse a cached clone of) a package's repository and decide
 * whether it is minable: not in FILTER_LIST, no tsconfig.json (TypeScript
 * projects are out of scope), has a package.json, and declares a known
 * testing library.
 *
 * @param {[string,string,number]} param0 - [package name, git URL, download count]
 * @returns {Promise<[string,string|null]>} [repoName, "test" script or null]
 */
export async function cloneRepoAndCheck([repoName, repoGitUrl, downloadCount]) {
  const repoPath = resolve('cache/repos', repoName)

  if (FILTER_LIST.includes(repoGitUrl)) {
    console.log("[git] ignoring ", repoName)
    return [repoName, null]
  }
  // console.log('[git] fetching',repoName, repoGitUrl);
  await cacheCloneIdempotently(repoPath, repoName, repoGitUrl)

  // Reject TypeScript projects.
  const tsConfigFileLocation = resolve(repoPath, 'tsconfig.json');
  if (existsSync(tsConfigFileLocation)) return [repoName, null];

  const packageFile = resolve(repoPath, 'package.json')
  if (!existsSync(packageFile)) return [repoName, null];
  const packageJSONContentsString = (await readFile(packageFile)).toString()
  const packageJSONContents = JSON.parse(packageJSONContentsString)

  const hasDependencies = checkTestingDependencies(packageJSONContents, repoName);
  if (!hasDependencies) return [repoName, null]
  // FIX: `?? null` keeps the documented [string, string|null] shape when the
  // package declares no "test" script (previously returned undefined).
  return [repoName, packageJSONContents?.scripts?.test ?? null]
}
|
||||
|
||||
/**
 * Check whether a parsed package.json declares at least one well-known
 * testing library in its dependencies or devDependencies.
 *
 * @param {object} packageJSONContents - parsed package.json contents
 * @param {string} repoName - repository name (unused; kept for signature compatibility)
 * @returns {boolean} true if any known testing library is declared
 */
function checkTestingDependencies(packageJSONContents, repoName) {
  const testingLibraries = ['jest', 'mocha', 'chai', 'istanbul', 'vitest'];

  // Union of runtime and dev dependency names.
  const dependencies = new Set([
    ...Object.keys(packageJSONContents.dependencies ?? {}),
    ...Object.keys(packageJSONContents.devDependencies ?? {}),
  ]);

  // FIX (idiom): a plain membership check replaces the third-party
  // `set.prototype.intersection` polyfill — same result, stdlib only.
  return testingLibraries.some((lib) => dependencies.has(lib));
}
|
||||
|
||||
/**
 * Shallow-clone `repoGitUrl` into `repoPath` unless a previous clone already
 * exists there, in which case the cached checkout is reused as-is.
 *
 * @param {string} repoPath - destination directory for the clone
 * @param {string} repoName - package name, used in error messages
 * @param {string} repoGitUrl - repository URL to clone
 * @throws {Error} if `repoPath` exists but is not a directory
 */
async function cacheCloneIdempotently(repoPath, repoName, repoGitUrl) {
  if (!existsSync(repoPath)) {
    console.log("[git] cloning", repoGitUrl);
    await git.clone(repoGitUrl, repoPath, { 'single-branch': true, depth: 1 })
    return;
  }

  const isDir = (await lstat(repoPath)).isDirectory()
  if (!isDir) {
    // FIX: Error takes a single message string (the second argument is an
    // options bag); the original `new Error(repoName, " is mangled...")`
    // silently dropped the explanatory text.
    throw new Error(`${repoName} is mangled. delete directory and re-clone.`)
  }
  // Existing directory: assume a valid earlier clone and reuse it.
}
|
Reference in New Issue
Block a user