Skip to content

Commit

Permalink
Add alternatives key for listing other possible language matches (#19)
Browse files Browse the repository at this point in the history
Adds a key containing the files that were classified using the fallback (first-listed) language, along with the other languages each file could be.
Each such file is mapped to its list of alternative matches under `files.alternatives`.
Resolves #15
  • Loading branch information
Nixinova authored Jun 29, 2023
1 parent 9728955 commit daaf868
Show file tree
Hide file tree
Showing 9 changed files with 53 additions and 18 deletions.
3 changes: 3 additions & 0 deletions changelog.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# Changelog

## Next
- Added `alternatives` key to list all other possible language matches for files that do not have a definite match.

## 2.5.6
*2023-06-28*
- Changed fetching of data files to fall back to using the packaged files if the fetch request fails ([#21](https://github.com/Nixinova/LinguistJS/issues/21)).
Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
"scripts": {
"download-files": "npx tsx@3 build/download-files",
"pre-publish": "npm run download-files && npm test && npm run test:perf",
"test:perf": "tsc && node test/perf",
"perf": "tsc && node test/perf",
"test": "tsc && node test/folder && echo --- && node test/unit"
},
"files": [
Expand Down
20 changes: 13 additions & 7 deletions readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,29 +38,35 @@ As an example, take the following file structure:
| | index.ts 2kB
| readme.md 3kB
| no-lang 10B
| x.pluginspec 10B
```

Running LinguistJS on this folder will return the following JSON:

```json
{
"files": {
"count": 4,
"bytes": 6010,
"count": 5,
"bytes": 6020,
"results": {
"/src/index.ts": "TypeScript",
"/src/cli.js": "JavaScript",
"/readme.md": "Markdown",
"/no-lang": null,
}
"/x.pluginspec": "Ruby",
},
"alternatives": {
"/x.pluginspec": ["XML"],
},
},
"languages": {
"count": 4,
"bytes": 6000,
"bytes": 6010,
"results": {
"JavaScript": { "type": "programming", "bytes": 1000, "color": "#f1e05a" },
"TypeScript": { "type": "programming", "bytes": 2000, "color": "#2b7489" },
"Markdown": { "type": "prose", "bytes": 3000, "color": "#083fa1" },
"JavaScript": { "type": "programming", "bytes": 1000, "color": "#f1e05a" },
"Markdown": { "type": "prose", "bytes": 3000, "color": "#083fa1" },
"Ruby": { "type": "programming", "bytes": 10, "color": "#701516" },
"TypeScript": { "type": "programming", "bytes": 2000, "color": "#2b7489" },
},
},
"unknown": {
Expand Down
25 changes: 21 additions & 4 deletions src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ async function analyse(input?: string | string[], opts: T.Options = {}): Promise
const extensions: Record<T.FilePath, string> = {};
const overrides: Record<T.FilePath, T.LanguageResult> = {};
const results: T.Results = {
files: { count: 0, bytes: 0, results: {} },
files: { count: 0, bytes: 0, results: {}, alternatives: {} },
languages: { count: 0, bytes: 0, results: {} },
unknown: { count: 0, bytes: 0, extensions: {}, filenames: {} },
};
Expand Down Expand Up @@ -157,8 +157,11 @@ async function analyse(input?: string | string[], opts: T.Options = {}): Promise
fileAssociations[file] = [];
extensions[file] = '';
}
const parent = !opts.childLanguages && result && langData[result].group || false;
fileAssociations[file].push(parent || result);
// Use the result's group (parent language) if one is present.
// Falls back to the result itself if `opts.childLanguages` is set or if there is no group.
const finalResult = !opts.childLanguages && result && langData[result].group || result;
if (!fileAssociations[file].includes(finalResult))
fileAssociations[file].push(finalResult);
extensions[file] = paths.extname(file).toLowerCase();
};
const overridesArray = Object.entries(overrides);
Expand Down Expand Up @@ -279,12 +282,14 @@ async function analyse(input?: string | string[], opts: T.Options = {}): Promise
if (Array.isArray(heuristic.language)) {
heuristic.language = heuristic.language[0];
}

// Make sure the results includes this language
const languageGroup = langData[heuristic.language]?.group;
const matchesLang = fileAssociations[file].includes(heuristic.language);
const matchesParent = languageGroup && fileAssociations[file].includes(languageGroup);
if (!matchesLang && !matchesParent)
continue;

// Normalise heuristic data
const patterns: string[] = [];
const normalise = (contents: string | string[]) => patterns.push(...[contents].flat());
Expand All @@ -296,17 +301,29 @@ async function analyse(input?: string | string[], opts: T.Options = {}): Promise
if (data.named_pattern) normalise(heuristicsData.named_patterns[data.named_pattern]);
}
}

// Check file contents and apply heuristic patterns
const fileContent = opts.fileContent?.length ? opts.fileContent[files.indexOf(file)] : await readFile(file).catch(() => null);
// Skip if file read errors
if (fileContent === null) continue;
// Apply heuristics
if (!patterns.length || patterns.some(pattern => pcre(pattern).test(fileContent))) {
results.files.results[file] = heuristic.language;
break;
}
}
}
// If no heuristics, assign a language
results.files.results[file] ??= fileAssociations[file][0];
if (!results.files.results[file]) {
const possibleLangs = fileAssociations[file];
// Assign first language as a default option
const defaultLang = possibleLangs[0];
const alternativeLangs = possibleLangs.slice(1)
results.files.results[file] = defaultLang;
// List alternative languages if there are any
if (alternativeLangs.length > 0)
results.files.alternatives[file] = alternativeLangs;
}
}

// Skip specified categories
Expand Down
1 change: 1 addition & 0 deletions src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ export interface Results {
bytes: Bytes
/** Note: Results use slashes as delimiters even on Windows. */
results: Record<FilePath, LanguageResult>
alternatives: Record<FilePath, LanguageResult[]>
}
languages: {
count: Integer
Expand Down
13 changes: 9 additions & 4 deletions test/expected.json
Original file line number Diff line number Diff line change
@@ -1,22 +1,27 @@
{
"files": {
"count": 8,
"bytes": 47,
"count": 9,
"bytes": 61,
"results": {
"~/al.al": "Perl",
"~/alternatives.asc": "AGS Script",
"~/file.txt": "JavaScript",
"~/folder/sub.txt": "Text",
"~/hashbang": "JavaScript",
"~/modeline.txt": "C++",
"~/Pipfile": "TOML",
"~/unknown": null
},
"alternatives": {
"~/alternatives.asc": [ "AsciiDoc", "Public Key" ]
}
},
"languages": {
"count": 5,
"bytes": 38,
"count": 6,
"bytes": 52,
"results": {
"Perl": { "type": "programming", "bytes": 0, "color": "#0298c3" },
"AGS Script": { "type": "programming", "bytes": 14, "color": "#B9D9FF" },
"JavaScript": { "type": "programming", "bytes": 23, "color": "#f1e05a" },
"Text": { "type": "prose", "bytes": 0 },
"C++": { "type": "programming", "bytes": 15, "color": "#f34b7d" },
Expand Down
4 changes: 2 additions & 2 deletions test/perf.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,14 @@ async function perfTest() {
const amount = +process.argv[2] || 75;
for (let i = 0; i < amount; i++) {
let t1 = +new Date();
await linguist('.');
await linguist('.', { offline: true });
let t2 = +new Date();
time += t2 - t1;
}
const unit = 'ms';
const total = time;
const average = total / amount;
const EXPECTED_MAX = 75; // 2.3
const EXPECTED_MAX = 100; // 2.6
console.log('\n<Performance test results>');
console.log('Total:', total, unit, `(n=${amount})`);
console.log('Average:', average, unit);
Expand Down
1 change: 1 addition & 0 deletions test/samples/alternatives.asc
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Alternatives
2 changes: 2 additions & 0 deletions test/unit.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ async function test([filename, fileContent = ''], [type, testVal]) {
'files': actual.files.results[filename],
'size': actual.files.bytes,
'count': actual.files.count,
'alternatives_count': Object.entries(actual.files.alternatives).length,
}[type];
const result = testContent === testVal;
i = `${+i + 1}`.padStart(2, '0');
Expand All @@ -36,6 +37,7 @@ async function unitTest() {
await test(['x.cpp'], ['files', 'C++']);
await test(['x.c'], ['files', 'C']);
await test(['x.R'], ['files', 'R']);
await test(['.m'], ['alternatives_count', 1])
desc('filenames');
await test(['Dockerfile'], ['files', 'Dockerfile']);
await test(['CMakeLists.txt'], ['files', 'CMake']);
Expand Down

0 comments on commit daaf868

Please sign in to comment.