in data/ami/ami_split_segments.pl [45:92]
sub split_on_comma {
my ($text, $comma_times, $btime, $etime, $max_words_per_seg)= @_;
my %comma_hash = %$comma_times;
print "Btime, Etime : $btime, $etime\n";
my $stime = ($etime+$btime)/2; #split time
my $skey = "";
my $otime = $btime;
foreach my $k (sort {$comma_hash{$a} cmp $comma_hash{$b} } keys %comma_hash) {
print "Key : $k : $comma_hash{$k}\n";
my $ktime = $comma_hash{$k};
if ($ktime==$btime) { next; }
if ($ktime==$etime) { last; }
if (abs($stime-$ktime)/2<abs($stime-$otime)/2) {
$otime = $ktime;
$skey = $k;
}
}
my %transcripts = ();
if (!($skey =~ /[\,][0-9]+/)) {
print "Cannot split into less than $max_words_per_seg words! Leaving : $text\n";
$transcripts{get_name($btime, $etime)}=$text;
return %transcripts;
}
print "Splitting $text on $skey at time $otime (stime is $stime)\n";
my @utts1 = split(/$skey\s+/, $text);
for (my $i=0; $i<=$#utts1; $i++) {
my $st = $btime;
my $et = $comma_hash{$skey};
if ($i>0) {
$st=$comma_hash{$skey};
$et = $etime;
}
my (@utts) = split (' ', $utts1[$i]);
if ($#utts < $max_words_per_seg) {
my $nm = get_name($st, $et);
print "SplittedOnComma[$i]: $nm : $utts1[$i]\n";
$transcripts{$nm} = $utts1[$i];
} else {
print 'Continue splitting!';
my %transcripts2 = split_on_comma($utts1[$i], \%comma_hash, $st, $et, $max_words_per_seg);
%transcripts = merge_hashes(\%transcripts, \%transcripts2);
}