#!/usr/bin/env perl
#######################################################################
# utf82rts.pl: converts UTF-8 text on stdin to RTS text on stdout.
#
# RTS is a transliteration standard for Telugu.
# See http://www.bhaavana.net/Rangavalli/webeditor.html#rts
# for more details.
#
# UTF-8 is a popular encoding scheme for unicode text.
# See http://www.cl.cam.ac.uk/~mgk25/unicode.html
# for more details.
#
# If you wish to know more about using unicode for telugu script,
# see http://groups.yahoo.com/group/racchabanda/message/7202
#
# Pre-requisites:
#   This script has been tested with stock perl 5.8.0 on
#   stock linux 2.4 and stock cygwin(win2k) 1.3.22. See
#   'perldoc perluniintro' and 'perldoc perlunicode' for
#   more specific details on unicode support in perl.
#
# Author: Prasad A. Chodavarapu (http://chaitanya.bhaavana.net/)
# Version: $Revision: 1.3 $ $Date: 2003/08/14 20:59:44 $
#
# Copyright (c) 2003, Prasad A. Chodavarapu
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330,
# Boston, MA 02111-1307 USA
#######################################################################

use strict;
use warnings;
use Getopt::Std;

# All "positions" (*_pos / *_POS) here are offsets relative to $TELUGU_START.
my $TELUGU_START = 0xC00;          # first char in telugu charset
my $IMPLICIT_VOWEL_RTS = 'a';      # vowel carried by a bare consonant
my $DRTAM_POS = 77;                # dRtaM is a stand-alone pollu as in "jEsen"
my $DRTAM_RTS = 'n';
my $SUNNA_POS = 2;

# RTS equivalent at each "position". The index into this array is
# (code point - $TELUGU_START); an empty string means nothing is emitted
# for that position.
my @rts = (
    '',     #0 unused in unicode
    '@m',   #1 chandrabindu - arasunna?
    'M',    #2 sunna
    '@h',   #3 visarga
    '',     #4 unused in unicode
    'a',    #5
    'aa',   #6
    'i',    #7
    'ee',   #8
    'u',    #9
    'oo',   #10
    'R',    #11
    '~l',   #12
    '',     #13 unused in unicode
    'e',    #14
    'ae',   #15
    'ai',   #16
    '',     #17
    'o',    #18
    'O',    #19
    'au',   #20
    'k',    #21
    'kh',   #22
    'g',    #23
    'gh',   #24
    '~m',   #25
    'ch',   #26
    'Ch',   #27
    'j',    #28
    'jh',   #29
    '~n',   #30
    'T',    #31
    'Th',   #32
    'D',    #33
    'Dh',   #34
    'N',    #35
    't',    #36
    'th',   #37
    'd',    #38
    'dh',   #39
    'n',    #40
    '',     #41 - unused in unicode
    'p',    #42
    'ph',   #43
    'b',    #44
    'bh',   #45
    'm',    #46
    'y',    #47
    'r',    #48
    '~r',   #49
    'l',    #50
    'L',    #51
    '',     #52 - unused in unicode
    'v',    #53
    'S',    #54
    'sh',   #55
    's',    #56
    'h',    #57
    '',     #58 - unused in unicode
    '',     #59 - unused in unicode
    '',     #60 - unused in unicode
    '',     #61 - unused in unicode
    'aa',   #62
    'i',    #63
    'ee',   #64
    'u',    #65
    'oo',   #66
    'R',    #67
    'Ru',   #68
    '',     #69 - unused in unicode
    'e',    #70
    'ae',   #71
    'ai',   #72
    '',     #73 - unused in unicode
    'o',    #74
    'O',    #75
    'au',   #76
    '',     #77 - pollu implicit in rts
    '',     #78 - unused in unicode
    '',     #79 - unused in unicode
    '',     #80 - unused in unicode
    '',     #81 - unused in unicode
    '',     #82 - unused in unicode
    '',     #83 - unused in unicode
    '',     #84 - unused in unicode
    '',     #85 - donno what "length mark" means
    'ai',   #86
    '',     #87 - unused in unicode
    '',     #88 - unused in unicode
    '',     #89 - unused in unicode
    '',     #90 - unused in unicode
    '',     #91 - unused in unicode
    '',     #92 - unused in unicode
    '',     #93 - unused in unicode
    '',     #94 - unused in unicode
    '',     #95 - unused in unicode
    'Ru',   #96
    '~L',   #97
    '',     #98 - unused in unicode
    '',     #99 - unused in unicode
    '',     #100 - unused in unicode
    '',     #101 - unused in unicode
    '0',    #102
    '1',    #103
    '2',    #104
    '3',    #105
    '4',    #106
    '5',    #107
    '6',    #108
    '7',    #109
    '8',    #110
    '9',    #111
);

# Highest position that has an entry in @rts; anything above it (or below
# zero) is treated by the main loop as "not a telugu char".
my $maxPos = $#rts;

# Returns true if the given position is that of a consonant.
#   $pos - offset of the character from $TELUGU_START
sub isConsonant {
    my $pos = shift;
    return ((21 <= $pos) and ($pos <= 57));
}

# Returns true if the given position is that of a vowel sign.
#   $pos - offset of the character from $TELUGU_START
# The range deliberately ends at 77 ($DRTAM_POS): a pollu following a
# consonant is consumed like a vowel sign and, having an empty @rts entry,
# simply suppresses the implicit vowel.
# NOTE(review): position 86 maps to 'ai' in @rts but lies outside this
# range, so a consonant directly followed by it still gets the implicit
# vowel first - confirm whether that is intended.
sub isVowelModifier {
    my $pos = shift;
    return ((62 <= $pos) and ($pos <= 77));
}

# Prints usage to stdout.
# The heredoc opener, the synopsis line and the <separator> placeholder
# were missing from the surviving text and have been reconstructed; the
# option descriptions are the original wording.
sub usage {
    print <<'USAGE';
Usage: utf82rts.pl [-h] [-s <separator>] < utf8.txt > rts.txt
Options:
  -h: print usage
  -s <separator>: demarkate RTS boundaries with given separator.
      use -s '' to turn off demarkation.
      default separator is '#'
USAGE
}

# Returns true if given char is punctuation or white space.
#   $char - a single character
# Concretely: true for every code point that is not an ASCII letter,
# i.e. <= 64 ('@' and below), 91..96 ('[' to '`'), or >= 123 ('{' and up).
sub isPunctuation {
    my $char = shift;
    my $absPos = ord($char);
    return (($absPos <= 64)
            or (($absPos >= 91) and ($absPos <= 96))
            or ($absPos >= 123));
}

############ start of main flow ################

# option processing
my %opt;
getopts('hs:', \%opt);
if (exists($opt{h})) {
    usage();
    exit 0;
}

# separator printed around each run of RTS output; -s '' disables it
my $sep = '#';
if (exists($opt{s})) {
    $sep = $opt{s};
}

# enable utf8 wrappers for stdin and stdout
binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");

# parser iterates over each char and applies the conversion.
# parser state merely consists of whether or not a
# vowel modifier determination is pending for the
# previous consonant. in case RTS demarkation is turned on,
# we've one more state variable to check if we are in rts.
#
# Here's an example: "baagunnaaraa!"
# is represented in unicode as
# ba,aa,ga,u,na,pollu,na,aa,ra,aa,!
# As you can see, every consonant comes with an implicit
# vowel (a) that can be overridden/suppressed by a subsequent
# vowel modifier/pollu respectively.
#
# Parser state, carried across characters and across input lines:
#   $vowelPending - the previous char was a consonant whose vowel (an
#                   explicit modifier or the implicit 'a') is not yet printed
#   $inRts        - an opening separator has been printed and the closing
#                   one is still owed
my $vowelPending = 0;
my $inRts = 0;

while (my $line = <>) {
    foreach my $char (split //, $line) {
        my $pos = ord($char) - $TELUGU_START;

        # take care of unfinished business first!
        # complete conversion of previous consonant.
        if ($vowelPending) {
            $vowelPending = 0;
            if (isVowelModifier($pos)) {
                # explicit vowel sign (or pollu, whose entry is empty)
                # replaces the implicit vowel; char is fully consumed.
                print $rts[$pos];
                next;
            }
            # anything else means the consonant keeps its implicit vowel;
            # fall through so the current char is still processed.
            print $IMPLICIT_VOWEL_RTS;
        }

        # is it a telugu char? if not, just echo.
        if (($pos < 0) or ($pos > $maxPos)) {
            # print separator first if we were inRts
            # but don't do it for intermittent white space
            if ($inRts and $sep and !isPunctuation($char)) {
                print $sep;
                $inRts = 0;
            }
            print $char;
            next;
        }

        # print separator first if we are getting into RTS afresh
        unless ($inRts) {
            print $sep;
            $inRts = 1;
        }

        # replace with rts equivalent.
        print $rts[$pos];

        # as unicode uses the same position for both pollu and dRtaM,
        # and as we've set an empty replacement to take care of the pollu
        # case, we need to tackle the dRtaM case specially. a pollu that
        # reaches this point did not follow a consonant, so it is a dRtaM.
        if ($pos == $DRTAM_POS) {
            print $DRTAM_RTS;
        }

        # mark state if we have unfinished business.
        if (isConsonant($pos)) {
            $vowelPending = 1;
        }
    }
}

# input may end right after a consonant (no trailing newline); without this
# flush its implicit vowel would be silently dropped.
if ($vowelPending) {
    print $IMPLICIT_VOWEL_RTS;
    $vowelPending = 0;
}

# close a still-open RTS run at end of input
if ($inRts and $sep) {
    print $sep;
    $inRts = 0;
}