parent
b9f6504a02
commit
88163855bd
@ -1,3 +1,7 @@
|
||||
LD_LIBRARY_PATH=/home/hai/sledge/sledge/runtime/bin
|
||||
SLEDGE_SCHEDULER=MDL
|
||||
SLEDGE_SCHEDULER=EDF
|
||||
SLEDGE_SANDBOX_PERF_LOG=/home/hai/sledge/sledge/runtime/tests/runtime_sandbox_perf_log.log
|
||||
SLEDGE_CPU_SPEED=2500
|
||||
SLEDGE_SCHEDULER=SRSF
|
||||
SLEDGE_SIGALRM_HANDLER=TRIAGED
|
||||
SLEDGE_NWORKERS=5
|
||||
|
@ -1 +0,0 @@
|
||||
Subproject commit 15edd745735469de6a949766fbdeb73fd298baf6
|
@ -0,0 +1,17 @@
|
||||
{
|
||||
"active": true,
|
||||
"name": "test_demo",
|
||||
"path": "hello_ps_wasm.so",
|
||||
"port": 10000,
|
||||
"relative-deadline-us": 180000,
|
||||
"argsize": 1,
|
||||
"priority": 1,
|
||||
"pre_module_count": 0,
|
||||
"next_modules": [],
|
||||
"http-req-headers": [],
|
||||
"http-req-content-type": "audio/raw",
|
||||
"http-req-size": 1024000,
|
||||
"http-resp-headers": [],
|
||||
"http-resp-size": 102400,
|
||||
"http-resp-content-type": "text/plain"
|
||||
}
|
@ -0,0 +1,209 @@
|
||||
// Simple tool to extract unigram counts
|
||||
// Jeffrey Pennington (jpennin@stanford.edu)
|
||||
// From http://nlp.stanford.edu/projects/glove/ covered under APACHE LICENSE, a copy of which follows
|
||||
//
|
||||
// This software is licensed under the Apache 2 license, quoted below.
|
||||
//
|
||||
// Copyright 2014 Jeffrey Pennington <jpennin@stanford.edu>
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy of
|
||||
// the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
// License for the specific language governing permissions and limitations under
|
||||
// the License.
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#define MAX_STRING_LENGTH 1000   /* longest token accepted by the fscanf width in get_counts */
#define TSIZE 1048576            /* number of hash-table buckets (2^20) */
#define SEED 1159241             /* seed fed to the bitwise hash function */
#define HASHFN bitwisehash       /* hash function used by hashinsert */
|
||||
|
||||
/* Vocabulary entry: one distinct word and its occurrence count. */
typedef struct vocabulary {
    char *word;
    long long count;
} VOCAB;

/* Hash-table chain node: word, its count, and the next record in the bucket. */
typedef struct hashrec {
    char *word;
    long long count;
    struct hashrec *next;
} HASHREC;
|
||||
|
||||
int verbose = 2; // verbosity level: 0, 1, or 2
long long min_count = 1; // min occurrences for inclusion in vocab
long long max_vocab = 0; // max vocabulary size; max_vocab = 0 for no limit
|
||||
|
||||
|
||||
/* Compare two NUL-terminated strings; returns 0 if equal, otherwise the
   difference of the first mismatching characters (strcmp-like sign). */
int scmp( char *s1, char *s2 ) {
    for (; *s1 == *s2; s1++, s2++) {
        if (*s1 == '\0') return 0;   /* both ended together: equal */
    }
    return (*s1 - *s2);
}
|
||||
|
||||
|
||||
/* Vocab frequency comparison; break ties alphabetically */
|
||||
int CompareVocabTie(const void *a, const void *b) {
|
||||
long long c;
|
||||
if( (c = ((VOCAB *) b)->count - ((VOCAB *) a)->count) != 0) return ( c > 0 ? 1 : -1 );
|
||||
else return (scmp(((VOCAB *) a)->word,((VOCAB *) b)->word));
|
||||
|
||||
}
|
||||
|
||||
/* Vocab frequency comparison; no tie-breaker */
|
||||
int CompareVocab(const void *a, const void *b) {
|
||||
long long c;
|
||||
if( (c = ((VOCAB *) b)->count - ((VOCAB *) a)->count) != 0) return ( c > 0 ? 1 : -1 );
|
||||
else return 0;
|
||||
}
|
||||
|
||||
/* Move-to-front hashing and hash function from Hugh Williams, http://www.seg.rmit.edu.au/code/zwh-ipl/ */
|
||||
|
||||
/* Simple bitwise hash: fold each character of word into seed, then reduce
   modulo tsize.  Returns a bucket index in [0, tsize). */
unsigned int bitwisehash(char *word, int tsize, unsigned int seed) {
    unsigned int h = seed;
    char c;
    while ((c = *word++) != '\0') {
        h ^= (h << 5) + c + (h >> 2);
    }
    return (unsigned int)((h & 0x7fffffff) % tsize);
}
|
||||
|
||||
/* Create hash table, initialise pointers to NULL */
|
||||
HASHREC ** inithashtable() {
|
||||
int i;
|
||||
HASHREC **ht;
|
||||
ht = (HASHREC **) malloc( sizeof(HASHREC *) * TSIZE );
|
||||
for(i = 0; i < TSIZE; i++) ht[i] = (HASHREC *) NULL;
|
||||
return(ht);
|
||||
}
|
||||
|
||||
/* Search hash table for given string, insert if not found */
|
||||
void hashinsert(HASHREC **ht, char *w) {
|
||||
HASHREC *htmp, *hprv;
|
||||
unsigned int hval = HASHFN(w, TSIZE, SEED);
|
||||
|
||||
for(hprv = NULL, htmp = ht[hval]; htmp != NULL && scmp(htmp->word, w) != 0; hprv = htmp, htmp = htmp->next);
|
||||
if(htmp == NULL) {
|
||||
htmp = (HASHREC *) malloc( sizeof(HASHREC) );
|
||||
htmp->word = (char *) malloc( strlen(w) + 1 );
|
||||
strcpy(htmp->word, w);
|
||||
htmp->count = 1;
|
||||
htmp->next = NULL;
|
||||
if( hprv==NULL )
|
||||
ht[hval] = htmp;
|
||||
else
|
||||
hprv->next = htmp;
|
||||
}
|
||||
else {
|
||||
/* new records are not moved to front */
|
||||
htmp->count++;
|
||||
if(hprv != NULL) {
|
||||
/* move to front on access */
|
||||
hprv->next = htmp->next;
|
||||
htmp->next = ht[hval];
|
||||
ht[hval] = htmp;
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
int get_counts() {
|
||||
long long i = 0, j = 0, vocab_size = 12500;
|
||||
char format[20];
|
||||
char str[MAX_STRING_LENGTH + 1];
|
||||
HASHREC **vocab_hash = inithashtable();
|
||||
HASHREC *htmp;
|
||||
VOCAB *vocab;
|
||||
FILE *fid = stdin;
|
||||
|
||||
fprintf(stderr, "BUILDING VOCABULARY\n");
|
||||
if(verbose > 1) fprintf(stderr, "Processed %lld tokens.", i);
|
||||
sprintf(format,"%%%ds",MAX_STRING_LENGTH);
|
||||
while(fscanf(fid, format, str) != EOF) { // Insert all tokens into hashtable
|
||||
hashinsert(vocab_hash, str);
|
||||
if(((++i)%100000) == 0) if(verbose > 1) fprintf(stderr,"\033[11G%lld tokens.", i);
|
||||
}
|
||||
if(verbose > 1) fprintf(stderr, "\033[0GProcessed %lld tokens.\n", i);
|
||||
vocab = malloc(sizeof(VOCAB) * vocab_size);
|
||||
for(i = 0; i < TSIZE; i++) { // Migrate vocab to array
|
||||
htmp = vocab_hash[i];
|
||||
while (htmp != NULL) {
|
||||
vocab[j].word = htmp->word;
|
||||
vocab[j].count = htmp->count;
|
||||
j++;
|
||||
if(j>=vocab_size) {
|
||||
vocab_size += 2500;
|
||||
vocab = (VOCAB *)realloc(vocab, sizeof(VOCAB) * vocab_size);
|
||||
}
|
||||
htmp = htmp->next;
|
||||
}
|
||||
}
|
||||
if(verbose > 1) fprintf(stderr, "Counted %lld unique words.\n", j);
|
||||
if(max_vocab > 0 && max_vocab < j)
|
||||
// If the vocabulary exceeds limit, first sort full vocab by frequency without alphabetical tie-breaks.
|
||||
// This results in pseudo-random ordering for words with same frequency, so that when truncated, the words span whole alphabet
|
||||
qsort(vocab, j, sizeof(VOCAB), CompareVocab);
|
||||
else max_vocab = j;
|
||||
qsort(vocab, max_vocab, sizeof(VOCAB), CompareVocabTie); //After (possibly) truncating, sort (possibly again), breaking ties alphabetically
|
||||
|
||||
for(i = 0; i < max_vocab; i++) {
|
||||
if(vocab[i].count < min_count) { // If a minimum frequency cutoff exists, truncate vocabulary
|
||||
if(verbose > 0) fprintf(stderr, "Truncating vocabulary at min count %lld.\n",min_count);
|
||||
break;
|
||||
}
|
||||
printf("%s %lld\n",vocab[i].word,vocab[i].count);
|
||||
}
|
||||
|
||||
if(i == max_vocab && max_vocab < j) if(verbose > 0) fprintf(stderr, "Truncating vocabulary at size %lld.\n", max_vocab);
|
||||
fprintf(stderr, "Using vocabulary of size %lld.\n\n", i);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Scan argv[1..argc-1] for an option named str.  Returns its index when it
   has a following value; exits with an error if it is the last argument;
   returns -1 when the option is absent. */
int find_arg(char *str, int argc, char **argv) {
    for (int idx = 1; idx < argc; idx++) {
        if (scmp(str, argv[idx]) != 0) continue;
        if (idx == argc - 1) {
            printf("No argument given for %s\n", str);
            exit(1);
        }
        return idx;
    }
    return -1;
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
int i;
|
||||
// if (argc == 1) {
|
||||
// printf("Simple tool to extract unigram counts\n");
|
||||
// printf("Author: Jeffrey Pennington (jpennin@stanford.edu)\n\n");
|
||||
// printf("Usage options:\n");
|
||||
// printf("\t-verbose <int>\n");
|
||||
// printf("\t\tSet verbosity: 0, 1, or 2 (default)\n");
|
||||
// printf("\t-max-vocab <int>\n");
|
||||
// printf("\t\tUpper bound on vocabulary size, i.e. keep the <int> most frequent words. The minimum frequency words are randomly sampled so as to obtain an even distribution over the alphabet.\n");
|
||||
// printf("\t-min-count <int>\n");
|
||||
// printf("\t\tLower limit such that words which occur fewer than <int> times are discarded.\n");
|
||||
// printf("\nExample usage:\n");
|
||||
// printf("./vocab_count -verbose 2 -max-vocab 100000 -min-count 10 < corpus.txt > vocab.txt\n");
|
||||
// return 0;
|
||||
// }
|
||||
|
||||
// if ((i = find_arg((char *)"-verbose", argc, argv)) > 0) verbose = atoi(argv[i + 1]);
|
||||
// if ((i = find_arg((char *)"-max-vocab", argc, argv)) > 0) max_vocab = atoll(argv[i + 1]);
|
||||
// if ((i = find_arg((char *)"-min-count", argc, argv)) > 0) min_count = atoll(argv[i + 1]);
|
||||
verbose = 2;
|
||||
max_vocab = 100000;
|
||||
min_count = 2;
|
||||
return get_counts();
|
||||
}
|
||||
|
Loading…
Reference in new issue