-
Notifications
You must be signed in to change notification settings - Fork 0
/
nomos.h
587 lines (523 loc) · 16.5 KB
/
nomos.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
/***************************************************************
Copyright (C) 2006-2014 Hewlett-Packard Development Company, L.P.
Copyright (C) 2014, Siemens AG
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
version 2 as published by the Free Software Foundation.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
***************************************************************/
/**
* \dir
* \brief Nomos agent source
* \file
* \brief Nomos header file
* \page nomos Nomos agent
* \tableofcontents
* \section nomosabout About
* Nomos does license identification using short phrases (regular expressions)
* and heuristics (e.g. phrase must be found in (or out of) proximity to
* another phrase or phrases).
*
* Nomos may also identify a "style" type of license if it has similarities
* with a known license type.
*
* The signatures which uniquely identify a license are stored in `STRINGS.in`
* file.
*
* If you wish to contribute to Nomos, please read [How to add a new license
* signature](https://github.com/fossology/fossology/wiki/Nomos).
*
* \subsection nomosdebug Debugging
* Activating the defines of PROC_TRACE and/or DOCTOR_DEBUG (see line 31 of
* file parse.c). Nomos generates lot of tracing information that is really
* helpful to debug it.
*
* `PROC_TRACE` will show you, for example, which regex's were tried and which
* are successful. To see the successful matches, grep the output file for
* "addRef".
*
* `DOCTOR_DEBUG` will show you the before and after versions of the buffer to
* be processed. Look in the output file or `----- [Dr-BEFORE:] -----` and
* `+++++ [Dr-AFTER] +++:`
*
* \section nomosactions Supported actions
*
* | Command line flag | Description |
* | ---: | :--- |
* | -i | Initialize the database, then exit. |
* | -c | Specify the directory for the system configuration. |
* | -l | Print full file path (command line only). |
* | -v | Verbose (-vv = more verbose). |
* | -J | Output in JSON. |
* | -S | Print Highlightinfo to stdout . |
* | file | If files are listed, print the licenses detected within them. |
* | no file | Process data from the scheduler. |
* | -V | Print the version info, then exit. |
* | -d | Specify a directory to scan. |
* | -n | Spaw n - 1 child processes to run, there will be n running
* processes(the parent and n - 1 children). \n The default n is 2(when n is
* less than 2 or not setting, will be changed to 2) when -d is specified. |
* \section nomossource Agent source
* - \link src/nomos/agent \endlink
* - \link src/nomos/ui \endlink
* - Functional test cases \link src/nomos/agent_tests/Functional \endlink
* - Unit test cases \link src/nomos/agent_tests/Unit \endlink
*/
#ifndef _NOMOS_H
#define _NOMOS_H 1
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif /* not defined _GNU_SOURCE */
#include <stdio.h>
#include <assert.h>
#include <stdlib.h>
#include <search.h>
#include <unistd.h>
#include <fcntl.h>
#include <string.h>
#include <strings.h>
#include <ctype.h>
#include <dirent.h>
#include <ftw.h>
#include <regex.h>
#include <getopt.h>
#include <time.h>
#include <libgen.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/wait.h>
#include <sys/mman.h>
#include <sys/time.h>
#include "nomos_gap.h"
#include <stdbool.h>
#include <semaphore.h>
#include "json_writer.h"
/** Use nomos in standalone mode (no FOSSology DB) */
#ifdef STANDALONE
#include "standalone.h"
#else
#include <libfossology.h>
#include <libfossdbmanager.h>
#endif
#include <glib.h>
/**
* To use our local version of debug-malloc(), compile -DMEMORY_TRACING
*/
#ifdef MEMORY_TRACING
#include "DMalloc.h"
#endif /* MEMORY_TRACING */
#define PRECHECK
#define GPLV2_BEATS_GPLV3
#define SAVE_UNCLASSIFIED_LICENSES
/*#define FLAG_NO_COPYRIGHT*/
#ifdef PROC_TRACE_SWITCH
#define PROC_TRACE
#endif /* PROC_TRACE_SWITCH */
#define myBUFSIZ 4096 ///< Buffer max length
#define MAX_RENAME 1000 ///< Max rename length
#define TEMP_FILE_LEN 100 ///< Max temp file length
/** MAX_SCANBYTES is the maximum number of bytes that will be scanned
* in a file. Historically, we have never found a license more than
* 64k into a file.
*/
#define MAX_SCANBYTES 1024*1024
/**
* Program options and flags
*
* MD: I think these are used when making nomos
*/
#define OPTS_DEBUG 0x1
#define OPTS_TRACE_SWITCH 0x2
#define OPTS_LONG_CMD_OUTPUT 0x4
#define OPTS_HIGHLIGHT_STDOUT 0x8
#define OPTS_NO_HIGHLIGHTINFO 0x10
#define OPTS_JSON_OUTPUT 0x20
#define OPTS_SCANNING_DIRECTORY 0x40
extern char debugStr[myBUFSIZ]; ///< Debug string
extern char dbErrString[myBUFSIZ]; ///< DB error string
extern char saveLics[myBUFSIZ]; ///< License string
extern size_t hashEntries; ///< Hash entries
/**
Flags for program control
*/
#define FL_SAVEBASE 0x20
#define FL_FRAGMENT 0x40
#define FL_SHOWMATCH 0x80
#define FL_NOCOPYRIGHT 0x100
/**
* Names of various files/dirs created/used
*/
#define FILE_FOUND "Found.txt"
#define FILE_SCORES "_scores"
#define DEBUGLOG "/tmp/NomosDebugLog"
/**
* Symbolic Boolean value for NO
*/
#define NO 0
/**
* Symbolic Boolean value for YES
*/
#define YES 1
/* List-sorting flags */
#define UNSORTED 0
#define SORT_BY_NAME 1
#define SORT_BY_NAME_ICASE 2
#define SORT_BY_COUNT_DSC 3
#define SORT_BY_COUNT_ASC 4
#define SORT_BY_ALIAS 5
#define SORT_BY_BASENAME 6
/* Interest level (licenses) */
#define IL_HIGH 3
#define IL_MED 2
#define IL_LOW 1
#define IL_NONE 0
#define IL_INIT -1
/**
* license-text search results (ltsr) stuff
*/
#define LTSR_RMASK ((char) 1) /**< True if it's been matched */
#define LTSR_SMASK ((char) 2) /**< True if it's been searched for */
#define LTSR_YES ((char) 3) /**< Both searched, and matched */
#define LTSR_NO LTSR_SMASK /**< Searched but not matched */
/**
* Miscellaneous strings used in various modules
*/
#define STR_NOTPKG "None (not an rpm-format package)"
/*
* License-scanning limits
*/
#define _scCOMFORT 9 /**< >= 9 --> certain it's a license */
#define _scINVALID 4 /**< < 4 --> probably NOT a license */
/**
* LS_ = License Summaries/Strings
*/
#define LS_NONE "None"
#define LS_UNLIKELY "LikelyNot"
#define LS_NOSUM "No_license_found"
#define LS_UNCL "UnclassifiedLicense"
#define LS_NOT_PD "NOT-public-domain"
#define LS_PD_CLM "Public-domain"
#define LS_PD_CPRT "Public-domain(C)"
#define LS_PD_ONLY "Public-domain-ref"
#define LS_CPRTONLY "Misc-Copyright"
#define LS_TDMKONLY "Trademark-ref"
#define LS_LICRONLY "License-ref"
#define LS_PATRONLY "Patent-ref"
/*
* NULL values
*/
#define NULL_ITEM (item_t *) NULL ///< NULL item
#define NULL_LIST (list_t *) NULL ///< NULL list
#define NULL_FH (fh_t *) NULL ///< NULL fh
#define NULL_CHAR '\0' ///< NULL character
#define NULL_STR (char *) NULL ///< NULL string
/**
* Macros needed across >1 source module
*/
#define isEOL(x) (((x == '\n') || (x == '\r') || (x == '\v'))) ///< Check if x points to a EOL character
#define IS_HUGE(x) (x >= gl.blkUpperLimit)
#define NOMOS_TEMP "/tmp/nomos.tempdir"
#define NOMOS_TLOCK "/tmp/nomos.tempdir/.lock.tmp," /**< CDB, This goes away. */
/**
* Caches memory-mapped files
*/
struct mm_cache {
int inUse; ///< Cache in use
int fd; ///< File descriptor
unsigned long size; ///< Size
void *mmPtr; ///< Memory pointer
char label[myBUFSIZ]; ///< Label
};
/*
CDB - This is kind of tricky, the way it uses the same fields for
different meanings. If we had objects, we could subclass. It works
okay, but is just a PITA for debugging.
*/
/**
listitem item_t
\brief tricky data structure used for a list of 'items'
Meanings of val fields are dependent on the particular list --
See #defines below for examples.
*/
struct listitem {
int val;
int val2;
int val3;
char *str; /**< primary key for list-element */
void *buf; /**< alias, extra data, whatever */
};
typedef struct listitem item_t;
/**
Defines for the list val fields
*/
#define seqNo val
#define foundTool val
#define refCount val
#define num val
#define bStart val
#define iFlag val
#define ssComp val2
#define isProcessed val2
#define iLevel val2
#define nMatch val2
#define bLen val2
#define bDocLen val3
#define bIndex val3
#define bList buf
/**
list
\brief list_t type structure used to keep various lists. (e.g. there are
multiple lists).
*/
struct list {
char name[64]; /**< Name of the list */
int used; /**< Number of items found, 0 is empty list */
int size; /**< What size is this? (MD) */
int ix; /**< The index for the items below */
int sorted; /**< Flag to indicate how ?? (the list or the items in the
list?) things are sorted: SORT_BY_NAME or
SORT_BY_NAME_ICASE */
int desc; /**< Description */
item_t *items; /**< List items */
};
typedef struct list list_t;
/**
* Search string
*/
struct searchString {
int csLen; ///< String length
char *csData; ///< String data
};
typedef struct searchString searchString_t;
/**
* License specification
*/
struct licenseSpec {
searchString_t seed; ///< License seed
searchString_t text; ///< License text
};
typedef struct licenseSpec licSpec_t;
/**
\brief Structure holding data truly global in that it remains consistent
for each file scanned.
*/
struct globals {
char initwd[myBUFSIZ]; ///< CDB, would like to workaround/eliminate.
char progName[64]; ///< Program name
int progOpts; ///< CLI options
int flags; ///< Flags
int uPsize; ///< Size
#ifdef GLOBAL_DEBUG
int DEEBUG;
int MEM_DEEBUG;
#endif /* GLOBAL_DEBUG */
#ifdef PROC_TRACE_SWITCH
int ptswitch;
#endif /* PROC_TRACE_SWITCH */
list_t sHash; ///< Hashes
/* Agent-specific Things */
int agentPk; ///< Agent id
long uploadFk; ///< Upload id
int arsPk; ///< Agent ars id
PGconn *pgConn; ///< DB Connection
fo_dbManager *dbManager; ///< FOSSology DB manager
};
/**
* License match positions and license type
*/
typedef struct {
int start; ///< Start position of match
int end; ///< End position of match
int index; ///< Enums from index (Entrynumber) in STRINGS.in
} MatchPositionAndType;
/**
* License matches
*/
typedef struct {
GArray* matchPositions; ///< Match positions
GArray* indexList; ///< License indexes
char* licenceName; ///< License names
int licenseFileId; ///< PFile id
} LicenceAndMatchPositions;
/**
curScan
\brief Struct that tracks state related to current file being scanned.
*/
struct curScan {
char cwd[myBUFSIZ]; /**< CDB, Would like to workaround and eliminate. */
char targetDir[myBUFSIZ]; /**< Directory where file is */ /* check */
char targetFile[myBUFSIZ]; /**< File we're scanning (tmp file)*/ /* check */
char filePath[myBUFSIZ]; /**< the original file path passed in */
long pFileFk; /**< [in] pfile_fk from scheduler */
char pFile[myBUFSIZ]; /**< [in] pfilename from scheduler */
char *licPara;
char *matchBase;
size_t targetLen;
size_t cwdLen;
struct stat stbuf;
regmatch_t regm;
list_t regfList;
list_t fLicFoundMap;
list_t parseList;
list_t offList;
list_t lList;
char compLic[myBUFSIZ]; /**< the license(s) found, None or NotLikely.
comma separated if multiple names are found. */
int nLines;
int cliMode; /**< boolean to indicate running from command line */
char *tmpLics; /**< pointer to storage for parsed names */
char *licenseList[512]; /**< list of license names found, can be a single name */
GArray* indexList; /**< List of license indexes */
GArray* theMatches; /**< List of matches */
GArray* keywordPositions; /**< List of matche positions */
GArray* docBufferPositionsAndOffsets;
int currentLicenceIndex;
FILE *tempJsonPath; /**< File descriptor for temporary file where
intermediate outputs for json are stored */
sem_t mutexTempJson; /**< Mutex to handle writes to tempJsonPath */
};
/**
* License pattern
*/
struct license {
int len; ///< Length of pattern
char *patt; ///< License pattern to use
};
/**
* License text to information
*/
struct licensetext {
char *regex; ///< License regex
char *tseed; ///< unencrypted license text
int nAbove;
int nBelow;
int compiled;
int plain;
};
typedef struct licensetext licText_t;
/**
* Get regex of a license text
*/
#define _REGEX(x) licText[x].regex
/**
* Get seed of a license text
*/
#define _SEED(x) licText[x].tseed
/**
* License scan result
*/
struct scanResults {
int score; ///< License match score
int kwbm;
int size;
int flag; ///< Flags
int dataOffset;
char fullpath[myBUFSIZ];
char linkname[16];
char *licenses;
char *relpath;
size_t nameOffset;
};
typedef struct scanResults scanres_t;
/**
* List-based memory tags
*/
#define MTAG_UNSORTKEY "list/str (initially-UNsorted key)"
#define MTAG_SORTKEY "list/str (initially-sorted key)"
#define MTAG_LISTKEY "list/str (sorted/unsorted key)"
#define MTAG_REPLKEY "list/str (replaced primary key)"
#define MTAG_LISTBUF "list/buf (any data)"
#define MTAG_PATHBASE "list/buf (path basename)"
#define MTAG_PKGINFO "list/buf (pkg rname/type/name/vers/lic)"
#define MTAG_PKG_NV "list/buf (pkg name/vers)"
#define MTAG_MD5SUM "list/buf (distro-arch MD5SUM)"
#define MTAG_COUNTER "list/buf integer (counter)"
#define MTAG_PKGNAME "list/buf (package-name)"
#define MTAG_PKGVERS "list/buf (package-vers)"
#define MTAG_CLAIMLIC "list/buf (claimlic copy)"
#define MTAG_COMPLIC "list/buf (pkg compLic copy)"
#define MTAG_URLCOPY "list/buf (pkg URL copy)"
#define MTAG_FILELIC "list/buf (file-license copy)"
#define MTAG_FIXNAME "list/buf (fixed-package name)"
/**
* Miscellaneous memory tags
*/
#define MTAG_SEEDTEXT "search-seed text"
#define MTAG_SRCHTEXT "license-search text"
#define MTAG_MMAPFILE "mmap-file data"
#define MTAG_MAGICDATA "file magic description"
#define MTAG_PATTRS "pkg-attr buffer"
#define MTAG_DOUBLED "doubled (reallocated) data"
#define MTAG_SEARCHBUF "initial search-data buffer"
#define MTAG_TOOSMALL "too-small half-size buffer"
#define MTAG_TEXTPARA "paragraph text"
#define MTAG_LIST "dynamically-allocated list"
#define MTAG_ENV "environment variable"
#define MTAG_SCANRES "scan-results list"
/*
Functions defined in nomos.c, used in other files
*/
void Bail(int exitval);
int optionIsSet(int val);
/*
Global Declarations
*/
extern struct globals gl;
extern struct curScan cur;
extern licText_t licText[];
extern licSpec_t licSpec[];
extern int schedulerMode; /* Non-zero if being run by scheduler */
/**
Declarations for using the memory debug stuff
*/
#ifdef MEMORY_TRACING
char *memAllocTagged();
void memFreeTagged();
#define memFree(x,y) memFreeTagged(x, y)
#define memAlloc(x,y) memAllocTagged(x, y)
#else /* NOT MEMORY_TRACING */
#define memFree(x,/*notused*/y) free(x)
#define memAlloc(x,y) calloc(x, 1)
#endif /* NOT MEMORY_TRACING */
/*
* Macros for timing - refer to findPhrase() for usage examples
*/
/* need TIMING_DECL in the declarations section of function */
#define DECL_TIMER struct timeval bTV, eTV; float proctime
#define ZERO_TIMER memcpy((void *) &bTV, (void *) &eTV, sizeof(eTV))
#define RESET_TIMER END_TIMER; ZERO_TIMER
#define START_TIMER RECORD_TIMER(bTV)
#define END_TIMER RECORD_TIMER(eTV) ; \
proctime = (float) (eTV.tv_sec - bTV.tv_sec) + \
((float) (eTV.tv_usec - bTV.tv_usec) * 0.000001)
#define RECORD_TIMER(x) (void) gettimeofday(&x, (struct timezone *) NULL)
#define PRINT_TIMER(x,y) printf("%11.6f seconds: %s\n", proctime, x); \
if (y) { DUMP_TIMERS; }
#define DUMP_TIMERS printf("[1]: %d.%06d\n", bTV.tv_sec, bTV.tv_usec); \
printf("[2]: %d.%06d\n", eTV.tv_sec, eTV.tv_usec)
/*
* Cut-and-paste this stuff to turn on timing
*/
#if 0
#ifdef TIMING
DECL_TIMER; /* timer declaration */
#endif
/* */
#ifdef TIMING
START_TIMER; /* turn on the timer */
#endif /* TIMING */
/* */
#ifdef TIMING
END_TIMER; /* stop the timer */
PRINT_TIMER("unpack", 0); /* ... and report */
START_TIMER; /* optionally re-start timer */
#endif /* TIMING */
#endif
#endif /* _NOMOS_H */