overlap.pl 1.44 KB
Newer Older
Loïc Barrault's avatar
Loïc Barrault committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89

# Main program: report the vocabulary overlap between two text files.
#
# Usage: overlap.pl STOPLIST FILE1 FILE2
#
# The score printed is (number of distinct tokens that occur in both
# files) / (number of distinct tokens that occur in either file).
@ARGV == 3
    || die "ERR: must specify 3 input files\n";

my ($stopFile, $file1, $file2) = @ARGV;

readStoplist($stopFile);
my %counts1 = countTokens($file1);
my %counts2 = countTokens($file2);

# Union of the two vocabularies.
my %allTokens = map { $_ => 1 } (keys %counts1, keys %counts2);

# Intersection size.  Test membership with exists rather than multiplying
# the two counts: a token seen in only one file has no entry in the other
# hash, and arithmetic on that undef value triggers warnings under -w.
my $inBothCount =
    grep { exists $counts1{$_} && exists $counts2{$_} } keys %allTokens;

my $totalTypes = keys %allTokens;
if ($totalTypes > 0) {
    printf "Similarity: %.2f\n", $inBothCount / $totalTypes;
} else {
    # Both files yielded no tokens; avoid dividing by zero.
    print "Similarity: 0\n";
}


# Load a stop list into the package-level %stops hash.
#
# Parameters:
#   $file - path of a text file holding one stop list entry per line.
#
# Each line, with its line ending removed, becomes a key of %stops with
# the value 1.  Dies if the file cannot be opened.
sub readStoplist {
    my ($file) = @_;
    # Three-argument open with a lexical handle, so the file name is never
    # interpreted as a mode or pipe specification.
    open(my $fh, '<', $file)
        or die "ERR: problem opening file ($file)\n";
    while (my $line = <$fh>) {
        # chomp (not chop) removes only a trailing newline, so a final
        # line that lacks one keeps its last character.
        chomp $line;
        # Also drop a carriage return left over from CRLF line endings.
        $line =~ s/\r\z//;
        $stops{$line} = 1;
    }
    close $fh;
}

# Report whether a token is in the stop list.
#
# Parameters:
#   $w - the token to look up in the package-level %stops hash.
#
# Returns 1 if %stops holds a true value for $w, otherwise 0.
#
# An unconditional "return 0;" used to sit at the top of this sub, which
# made the lookup unreachable and so disabled stop word filtering.
sub isaStopword {
    my ($w) = @_;
    return (defined $w && $stops{$w}) ? 1 : 0;
}

# Look up the count recorded for a word.
#
# Parameters:
#   $wd        - the word to look up.
#   $countsRef - reference to a hash mapping words to counts.
#
# Returns the stored count, or 0 when the hash has no defined entry for
# the word.
sub getCount {
    my ($wd, $countsRef) = @_;
    # Index with the $wd parameter; the lookup previously used $w, an
    # unrelated global, and so ignored the word that was passed in.
    my $count = $countsRef->{$wd};
    return defined $count ? $count : 0;
}

# Count the non-stop-word tokens in a file.
#
# Parameters:
#   $file - path of the text file to read.
#
# Each line is split with tokenise(); every token for which isaStopword()
# returns false is tallied.
#
# Returns a hash (as a flat key/value list) mapping each token to the
# number of times it occurred.  Dies if the file cannot be opened.
sub countTokens {
    my ($file) = @_;
    my %counts;
    # Three-argument open with a lexical handle: the file name cannot be
    # read as a mode, and the handle is not a shared global.
    open(my $in, '<', $file)
        or die "ERROR: problem opening file ($file)\n";
    while (my $line = <$in>) {
        for my $word (tokenise($line)) {
            $counts{$word}++ unless isaStopword($word);
        }
    }
    close $in;
    return %counts;
}

# Split a line of text into lower-case alphabetic tokens.
#
# Parameters:
#   $line - the text to tokenise.
#
# ASCII upper-case letters are folded to lower case, and every maximal
# run of the letters a-z becomes one token; all other characters act as
# separators.
#
# Returns the list of tokens in order of appearance (an empty list when
# the line contains no letters).  No empty tokens are produced, even when
# the line starts with a separator.
#
# This is the single definition of tokenise: the file previously defined
# the sub twice, and Perl silently kept only the later one.
sub tokenise {
    my ($line) = @_;
    $line = '' unless defined $line;
    $line =~ tr/A-Z/a-z/;
    my @tokens = $line =~ /[a-z]+/g;
    return @tokens;
}