| 52 | | # Source WordNet files: index file names, plus morph and data file names mapped to the part-of-speech value handed to the line processor for that file |
| 53 | | IndexFiles = %w[ index.noun index.verb index.adj index.adv ] |
| 54 | | MorphFiles = { |
| 55 | | 'adj.exc' => WordNet::Adjective, |
| 56 | | 'adv.exc' => WordNet::Adverb, |
| 57 | | 'noun.exc' => WordNet::Noun, |
| 58 | | 'verb.exc' => WordNet::Verb, |
| 59 | | 'cousin.exc' => '', |
| 60 | | } |
| 61 | | DataFiles = { |
| 62 | | 'data.adj' => WordNet::Adjective, |
| 63 | | 'data.adv' => WordNet::Adverb, |
| 64 | | 'data.noun' => WordNet::Noun, |
| 65 | | 'data.verb' => WordNet::Verb, |
| 66 | | } |
| 67 | | |
| 68 | | # Struct which represents a list of files, a display name, a database handle, and a processor |
| 69 | | # method that turns each line of those files into a key/value record for the database. |
| 70 | | Fileset = Struct::new( "WordNetFileset", :files, :name, :db, :processor ) |
| 71 | | |
| 72 | | # How many records to insert between transaction commits while loading a file |
| 73 | | CommitThreshold = 2000 |
| 74 | | |
| 75 | | # Temporary location for the lexicon data files: a directory next to this script, named after the basename of the default DB environment path |
| 76 | | BuildDir = Pathname.new( __FILE__ ).expand_path.dirname + |
| 77 | | Pathname.new( WordNet::Lexicon::DEFAULT_DB_ENV ).basename |
| 78 | | |
| 79 | | |
| 80 | | |
| 81 | | ##################################################################### |
| 82 | | ### M A I N P R O G R A M |
| 83 | | ##################################################################### |
| 84 | | def convertdb( errorLimit=0 ) |
| 85 | | $stderr.sync = $stdout.sync = true |
| 86 | | header "WordNet Lexicon Converter" |
| 87 | | |
| 88 | | # Make sure the user knows what they're in for |
| 89 | | message "This program will convert WordNet data files into databases\n"\ |
| 90 | | "used by Ruby-WordNet. This will not affect existing WordNet files,\n"\ |
| 91 | | "but will require up to 40Mb of disk space.\n" |
| 92 | | exit unless /^y/i =~ prompt_with_default("Continue?", "y") |
| 93 | | |
| 94 | | # Open the database and check to be sure it's empty. Confirm overwrite if |
| 95 | | # not. Checkpoint and set up logging proc if debugging. |
| 96 | | if BuildDir.exist? |
| 97 | | message ">>> Warning: Existing data in the Ruby-WordNet databases\n"\ |
| 98 | | "will be overwritten.\n" |
| 99 | | abort( "user cancelled." ) unless |
| 100 | | /^y/i =~ prompt_with_default( "Continue?", "n" ) |
| 101 | | BuildDir.rmtree |
| | 52 | class WordNetConverter |
| | 53 | |
| | 54 | # Source WordNet files: index file names, plus morph and data file names mapped to the part-of-speech value handed to the line processor for that file |
| | 55 | IndexFiles = %w[ index.noun index.verb index.adj index.adv ] |
| | 56 | MorphFiles = { |
| | 57 | 'adj.exc' => WordNet::Adjective, |
| | 58 | 'adv.exc' => WordNet::Adverb, |
| | 59 | 'noun.exc' => WordNet::Noun, |
| | 60 | 'verb.exc' => WordNet::Verb, |
| | 61 | 'cousin.exc' => '', |
| | 62 | } |
| | 63 | DataFiles = { |
| | 64 | 'data.adj' => WordNet::Adjective, |
| | 65 | 'data.adv' => WordNet::Adverb, |
| | 66 | 'data.noun' => WordNet::Noun, |
| | 67 | 'data.verb' => WordNet::Verb, |
| | 68 | } |
| | 69 | |
| | 70 | # Struct which represents a list of files, a display name, a database handle, and a processor |
| | 71 | # method that turns each line of those files into a key/value record for the database. |
| | 72 | Fileset = Struct::new( "WordNetFileset", :files, :name, :db, :processor ) |
| | 73 | |
| | 74 | # How many records to insert between transaction commits while loading a file |
| | 75 | CommitThreshold = 2000 |
| | 76 | |
| | 77 | # Default location for the lexicon data files: a directory next to this script, named after the basename of the default DB environment path |
| | 78 | BuildDir = Pathname.new( __FILE__ ).expand_path.dirname + |
| | 79 | Pathname.new( WordNet::Lexicon::DEFAULT_DB_ENV ).basename |
| | 80 | |
| | 81 | |
| | 82 | ### Create a new converter that will dump WordNet dictionary files into a BerkeleyDB |
| | 83 | ### in the given +builddir+ |
| | 84 | def initialize( builddir=BuildDir ) |
| | 85 | @builddir = Pathname.new( builddir ) |
| 103 | | |
| 104 | | # Find the source data files |
| 105 | | if ARGV.empty? |
| | 87 | |
| | 88 | |
| | 89 | ### Convert the various dict files from the WordNet project into a BerkeleyDB database |
| | 90 | def convertdb( errorLimit=0 ) |
| | 91 | $stderr.sync = $stdout.sync = true |
| | 92 | header "WordNet Lexicon Converter" |
| | 93 | |
| | 94 | # Make sure the user knows what they're in for |
| | 95 | message "This program will convert WordNet data files into databases\n"\ |
| | 96 | "used by Ruby-WordNet. This will not affect existing WordNet files,\n"\ |
| | 97 | "but will require up to 40Mb of disk space.\n" |
| | 98 | exit unless /^y/i =~ prompt_with_default("Continue?", "y") |
| | 99 | |
| | 100 | # Open the database and check to be sure it's empty. Confirm overwrite if |
| | 101 | # not. Checkpoint and set up logging proc if debugging. |
| | 102 | if @builddir.exist? && @builddir.entries.include?( 'data' ) |
| | 103 | message ">>> Warning: Existing data in the Ruby-WordNet databases\n"\ |
| | 104 | "will be overwritten.\n" |
| | 105 | abort( "user cancelled." ) unless |
| | 106 | /^y/i =~ prompt_with_default( "Continue?", "n" ) |
| | 107 | @builddir.rmtree |
| | 108 | end |
| | 109 | |
| | 110 | # Find the source data files |
| 116 | | datadir = prompt_with_default( "Data directory", default + "dict" ) |
| 117 | | else |
| 118 | | datadir = ARGV.shift |
| 119 | | end |
| 120 | | datadir = Pathname.new( datadir ) |
| 121 | | |
| 122 | | abort( "Directory '#{datadir}' does not exist" ) unless datadir.exist? |
| 123 | | abort( "'#{datadir}' is not a directory" ) unless datadir.directory? |
| 124 | | testfile = datadir + "data.noun" |
| 125 | | abort( "'#{datadir}' doesn't seem to contain the necessary files.") unless testfile.exist? |
| 126 | | |
| 127 | | # Open the lexicon readwrite into the temporary datadir |
| 128 | | BuildDir.mkpath |
| 129 | | lexicon = WordNet::Lexicon::new( BuildDir.to_s, 0666 ) |
| 130 | | |
| 131 | | # Process each fileset |
| 132 | | [ # Fileset, name, database handle, processor |
| 133 | | Fileset::new( IndexFiles, "index", lexicon.index_db, method(:parse_index_line) ), |
| 134 | | Fileset::new( MorphFiles, "morph", lexicon.morph_db, method(:parse_morph_line) ), |
| 135 | | Fileset::new( DataFiles, "data", lexicon.data_db, method(:parse_synset_line) ), |
| 136 | | ].each do |set| |
| 137 | | message "Converting %s files...\n" % set.name |
| 138 | | set.db.truncate |
| 139 | | |
| 140 | | # Process each file in the set with the appropriate processor method and |
| 141 | | # insert results into the corresponding table. |
| 142 | | set.files.each do |file,pos| |
| 143 | | message " #{file}..." |
| 144 | | |
| 145 | | filepath = File::join( datadir, file ) |
| 146 | | if !File::exists?( filepath ) |
| 147 | | message "missing: skipped\n" |
| 148 | | next |
| 149 | | end |
| 150 | | |
| 151 | | txn, dbh = lexicon.env.txn_begin( 0, set.db ) |
| 152 | | entries = lineNumber = errors = 0 |
| 153 | | File::readlines( filepath ).each do |line| |
| 154 | | lineNumber += 1 |
| 155 | | next if /^\s/ =~ line |
| 156 | | |
| 157 | | key, value = set.processor.call( line.chomp, lineNumber, pos ) |
| 158 | | unless key |
| 159 | | errors += 1 |
| 160 | | if errorLimit.nonzero? && errors >= errorLimit |
| 161 | | abort( "Too many errors" ) |
| | 121 | datadir = prompt_with_default( "Data directory", default + "/dict" ) |
| | 122 | datadir = Pathname.new( datadir ) |
| | 123 | |
| | 124 | abort( "Directory '#{datadir}' does not exist" ) unless datadir.exist? |
| | 125 | abort( "'#{datadir}' is not a directory" ) unless datadir.directory? |
| | 126 | testfile = datadir + "data.noun" |
| | 127 | abort( "'#{datadir}' doesn't seem to contain the necessary files.") unless testfile.exist? |
| | 128 | |
| | 129 | # Open the lexicon readwrite into the temporary datadir |
| | 130 | @builddir.mkpath |
| | 131 | lexicon = WordNet::Lexicon::new( @builddir.to_s, 0666 ) |
| | 132 | |
| | 133 | # Process each fileset |
| | 134 | [ # Fileset, name, database handle, processor |
| | 135 | Fileset::new( IndexFiles, "index", lexicon.index_db, method(:parse_index_line) ), |
| | 136 | Fileset::new( MorphFiles, "morph", lexicon.morph_db, method(:parse_morph_line) ), |
| | 137 | Fileset::new( DataFiles, "data", lexicon.data_db, method(:parse_synset_line) ), |
| | 138 | ].each do |set| |
| | 139 | message "Converting %s files...\n" % set.name |
| | 140 | set.db.truncate |
| | 141 | |
| | 142 | # Process each file in the set with the appropriate processor method and |
| | 143 | # insert results into the corresponding table. |
| | 144 | set.files.each do |file,pos| |
| | 145 | message " #{file}..." |
| | 146 | |
| | 147 | filepath = File::join( datadir, file ) |
| | 148 | if !File::exists?( filepath ) |
| | 149 | message "missing: skipped\n" |
| | 150 | next |
| | 151 | end |
| | 152 | |
| | 153 | txn, dbh = lexicon.env.txn_begin( 0, set.db ) |
| | 154 | entries = lineNumber = errors = 0 |
| | 155 | File::readlines( filepath ).each do |line| |
| | 156 | lineNumber += 1 |
| | 157 | next if /^\s/ =~ line |
| | 158 | |
| | 159 | key, value = set.processor.call( line.chomp, lineNumber, pos ) |
| | 160 | unless key |
| | 161 | errors += 1 |
| | 162 | if errorLimit.nonzero? && errors >= errorLimit |
| | 163 | abort( "Too many errors" ) |
| | 164 | end |
| | 165 | end |
| | 166 | |
| | 167 | dbh[ key ] = value |
| | 168 | entries += 1 |
| | 169 | print "%d%s" % [ entries, "\x08" * entries.to_s.length ] |
| | 170 | |
| | 171 | # Commit and start a new transaction every CommitThreshold records |
| | 172 | if (entries % CommitThreshold).zero? |
| | 173 | print "." |
| | 174 | txn.commit( BDB::TXN_NOSYNC ) |
| | 175 | txn, dbh = lexicon.env.txn_begin( 0, set.db ) |
| 164 | | |
| 165 | | dbh[ key ] = value |
| 166 | | entries += 1 |
| 167 | | print "%d%s" % [ entries, "\x08" * entries.to_s.length ] |
| 168 | | |
| 169 | | # Commit and start a new transaction every CommitThreshold records |
| 170 | | if (entries % CommitThreshold).zero? |
| 171 | | print "." |
| 172 | | txn.commit( BDB::TXN_NOSYNC ) |
| 173 | | txn, dbh = lexicon.env.txn_begin( 0, set.db ) |
| | 178 | |
| | 179 | message "committing..." |
| | 180 | txn.commit( BDB::TXN_SYNC ) |
| | 181 | message "done (%d entries, %d errors).\n" % |
| | 182 | [ entries, errors ] |
| | 183 | end |
| | 184 | |
| | 185 | lock_stats = lexicon.env.lock_stat |
| | 186 | message "Lock statistics:\n" |
| | 187 | puts " Lock objects: #{lock_stats['st_nobjects']}/#{lock_stats['st_maxnobjects']}", |
| | 188 | " Locks: #{lock_stats['st_nlocks']}/#{lock_stats['st_maxnlocks']}", |
| | 189 | " Lockers: #{lock_stats['st_nlockers']}/#{lock_stats['st_maxnlockers']}" |
| | 190 | |
| | 191 | |
| | 192 | message "Checkpointing DB and cleaning logs..." |
| | 193 | lexicon.checkpoint |
| | 194 | lexicon.clean_logs |
| | 195 | puts "done." |
| | 196 | end |
| | 197 | |
| | 198 | message "done.\n\n" |
| | 199 | end |
| | 200 | |
| | 201 | |
| | 202 | ####### |
| | 203 | private |
| | 204 | ####### |
| | 205 | |
| | 206 | # Index entry patterns, applied in this order by parse_index_line: header fields (lemma, pos, two counts), one pointer symbol, the sense/tagsense counts, one 8-digit synset id |
| | 207 | IndexEntry = /^(\S+)\s(\w)\s(\d+)\s(\d+)\s/ |
| | 208 | PointerSymbol = /(\S{1,2})\s/ |
| | 209 | SenseCounts = /(\d+)\s(\d+)\s/ |
| | 210 | SynsetId = /(\d{8})\s*/ |
| | 211 | |
| | 212 | ### Parse one line (+string+) from one of the index files and return a key and data pair: |
| | 213 | ### the key is "lemma%pos" and the data is the entry's synset ids joined with WordNet::SUB_DELIM. |
| | 214 | ### Reports the problem via +message+ and returns +nil+ if any part of the entry isn't able to be parsed. |
| | 215 | ### The incoming +pos+ argument is not used (it is overwritten by the value parsed from the line) -- it's just to make the interface between all three processor methods the same. |
| | 216 | def parse_index_line( string, lineNumber, pos=nil ) |
| | 217 | $scanner.string = string |
| | 218 | synsets = [] |
| | 219 | lemma, pos, polycnt = nil, nil, nil |
| | 220 | |
| | 221 | raise "whole error" unless $scanner.scan( IndexEntry ) |
| | 222 | lemma, pos, polycnt, pcnt = $scanner[1], $scanner[2], $scanner[3], $scanner[4] |
| | 223 | |
| | 224 | # Discard the pcnt pointer symbols that follow the header fields; they are not stored |
| | 225 | pcnt.to_i.times do |i| |
| | 226 | $scanner.skip( PointerSymbol ) or raise "couldn't skip pointer #{i}" |
| | 227 | end |
| | 228 | |
| | 229 | # Parse sense and tagsense counts (only the sense count is used below) |
| | 230 | $scanner.scan( SenseCounts ) or raise "couldn't parse sense counts" |
| | 231 | senseCount, tagSenseCount = $scanner[1], $scanner[2] |
| | 232 | |
| | 233 | # Find synsets, recording each one's position for this lemma in the global $senseIndex (read later by parse_synset_line) |
| | 234 | senseCount.to_i.times do |i| |
| | 235 | $scanner.scan( SynsetId ) or raise "couldn't parse synset #{i}" |
| | 236 | synset = $scanner[1] |
| | 237 | synsets.push( synset ) |
| | 238 | $senseIndex[ synset + "%" + pos + "%" + lemma ] = i.to_s |
| | 239 | end |
| | 240 | |
| | 241 | # Make the index entry (key "lemma%pos", data = synset ids joined by SUB_DELIM) and return it |
| | 242 | key = lemma + "%" + pos |
| | 243 | data = synsets.join(WordNet::SUB_DELIM) |
| | 244 | |
| | 245 | return key, data |
| | 246 | rescue => err |
| | 247 | message "Index entry did not parse: %s at '%s...' (line %d)\n\t%s\n" % [ |
| | 248 | err.message, |
| | 249 | $scanner.rest[0,20], |
| | 250 | lineNumber, |
| | 251 | err.backtrace[0] |
| | 252 | ] |
| | 253 | return nil |
| | 254 | end |
| | 255 | |
| | 256 | |
| | 257 | ### "Parse" a morph line: split +string+ on whitespace and return the first field suffixed with "%" and +pos+ as the key, and the second field as the value. Reports via +message+ and returns +nil+ if an error is raised. |
| | 258 | def parse_morph_line( string, lineNumber, pos ) |
| | 259 | key, value = string.split |
| | 260 | return "#{key}%#{pos}", value |
| | 261 | rescue => err |
| | 262 | message "Morph entry did not parse: %s for %s (pos = %s, line %d)\n\t%s\n" % [ |
| | 263 | err.message, |
| | 264 | string.inspect, |
| | 265 | pos.inspect, |
| | 266 | lineNumber, |
| | 267 | err.backtrace[0] |
| | 268 | ] |
| | 269 | return nil |
| | 270 | end |
| | 271 | |
| | 272 | |
| | 273 | # Synset data patterns used by parse_synset_line: synset header, one word entry, pointer count, one pointer, frame count, one verb frame, and the trailing gloss |
| | 274 | Synset = /(\d+)\s(\d{2})\s(\w)\s(\w{2})\s/ |
| | 275 | SynWord = /(\S+)\s(\w)*\s*/ |
| | 276 | SynPtrCnt = /(\d{3})\s/ |
| | 277 | SynPtr = /(\S{1,2})\s(\d+)\s(\w)\s(\w{4})\s/ |
| | 278 | SynFrameCnt = /\s*(\d{2})\s/ |
| | 279 | SynFrame = /\+\s(\d{2})\s(\w{2})\s/ |
| | 280 | SynGloss = /\s*\|\s*(.+)?/ |
| | 281 | |
| | 282 | ### Parse an entry from a data file and return the key and data. Returns +nil+ |
| | 283 | ### if any part of the entry isn't able to be parsed. |
| | 284 | def parse_synset_line( string, lineNumber, pos ) |
| | 285 | $scanner.string = string |
| | 286 | |
| | 287 | filenum, synsetType, gloss = nil, nil, nil |
| | 288 | words = [] |
| | 289 | ptrs = [] |
| | 290 | frames = [] |
| | 291 | |
| | 292 | # Parse the first part of the synset |
| | 293 | $scanner.scan( Synset ) or raise "unable to parse synset" |
| | 294 | offset, filenum, synsetType, wordCount = |
| | 295 | $scanner[1], $scanner[2], $scanner[3], $scanner[4] |
| | 296 | |
| | 297 | # Parse the words |
| | 298 | wordCount.to_i(16).times do |i| |
| | 299 | $scanner.scan( SynWord ) or raise "unable to parse word #{i}" |
| | 300 | word, lexid = $scanner[1], $scanner[2] |
| | 301 | senseKey = (offset + "%" + pos + "%" + word).downcase |
| | 302 | if !$senseIndex.key?( senseKey ) |
| | 303 | newKey = senseKey.sub( /\(\w+\)$/, '' ) |
| | 304 | if !$senseIndex.key?( newKey ) |
| | 305 | raise "Sense index does not contain sense '#{senseKey}' "\ |
| | 306 | "(tried #{newKey}, too)." |
| 176 | | |
| 177 | | message "committing..." |
| 178 | | txn.commit( BDB::TXN_SYNC ) |
| 179 | | message "done (%d entries, %d errors).\n" % |
| 180 | | [ entries, errors ] |
| 181 | | end |
| 182 | | |
| 183 | | lock_stats = lexicon.env.lock_stat |
| 184 | | message "Lock statistics:\n" |
| 185 | | puts " Lock objects: #{lock_stats['st_nobjects']}/#{lock_stats['st_maxnobjects']}", |
| 186 | | " Locks: #{lock_stats['st_nlocks']}/#{lock_stats['st_maxnlocks']}", |
| 187 | | " Lockers: #{lock_stats['st_nlockers']}/#{lock_stats['st_maxnlockers']}" |
| 188 | | |
| 189 | | |
| 190 | | message "Checkpointing DB and cleaning logs..." |
| 191 | | lexicon.checkpoint |
| 192 | | lexicon.clean_logs |
| 193 | | puts "done." |
| 194 | | end |
| 195 | | |
| 196 | | message "done.\n\n" |
| 197 | | end |
| 198 | | |
| 199 | | |
| 200 | | # Index entry patterns, applied in this order by parse_index_line: header fields (lemma, pos, two counts), one pointer symbol, the sense/tagsense counts, one 8-digit synset id |
| 201 | | IndexEntry = /^(\S+)\s(\w)\s(\d+)\s(\d+)\s/ |
| 202 | | PointerSymbol = /(\S{1,2})\s/ |
| 203 | | SenseCounts = /(\d+)\s(\d+)\s/ |
| 204 | | SynsetId = /(\d{8})\s*/ |
| 205 | | |
| 206 | | ### Parse one line (+string+) from one of the index files and return a key and data pair: |
| 207 | | ### the key is "lemma%pos" and the data is the entry's synset ids joined with WordNet::SUB_DELIM. |
| 208 | | ### Reports the problem via +message+ and returns +nil+ if any part of the entry isn't able to be parsed. |
| 209 | | ### The incoming +pos+ argument is not used (it is overwritten by the value parsed from the line) -- it's just to make the interface between all three processor methods the same. |
| 210 | | def parse_index_line( string, lineNumber, pos=nil ) |
| 211 | | $scanner.string = string |
| 212 | | synsets = [] |
| 213 | | lemma, pos, polycnt = nil, nil, nil |
| 214 | | |
| 215 | | raise "whole error" unless $scanner.scan( IndexEntry ) |
| 216 | | lemma, pos, polycnt, pcnt = $scanner[1], $scanner[2], $scanner[3], $scanner[4] |
| 217 | | |
| 218 | | # Discard the pcnt pointer symbols that follow the header fields; they are not stored |
| 219 | | pcnt.to_i.times do |i| |
| 220 | | $scanner.skip( PointerSymbol ) or raise "couldn't skip pointer #{i}" |
| 221 | | end |
| 222 | | |
| 223 | | # Parse sense and tagsense counts (only the sense count is used below) |
| 224 | | $scanner.scan( SenseCounts ) or raise "couldn't parse sense counts" |
| 225 | | senseCount, tagSenseCount = $scanner[1], $scanner[2] |
| 226 | | |
| 227 | | # Find synsets, recording each one's position for this lemma in the global $senseIndex (read later by parse_synset_line) |
| 228 | | senseCount.to_i.times do |i| |
| 229 | | $scanner.scan( SynsetId ) or raise "couldn't parse synset #{i}" |
| 230 | | synset = $scanner[1] |
| 231 | | synsets.push( synset ) |
| 232 | | $senseIndex[ synset + "%" + pos + "%" + lemma ] = i.to_s |
| 233 | | end |
| 234 | | |
| 235 | | # Make the index entry (key "lemma%pos", data = synset ids joined by SUB_DELIM) and return it |
| 236 | | key = lemma + "%" + pos |
| 237 | | data = synsets.join(WordNet::SUB_DELIM) |
| 238 | | |
| 239 | | return key, data |
| 240 | | rescue => err |
| 241 | | message "Index entry did not parse: %s at '%s...' (line %d)\n\t%s\n" % [ |
| 242 | | err.message, |
| 243 | | $scanner.rest[0,20], |
| 244 | | lineNumber, |
| 245 | | err.backtrace[0] |
| 246 | | ] |
| 247 | | return nil |
| 248 | | end |
| 249 | | |
| 250 | | |
| 251 | | ### "Parse" a morph line: split +string+ on whitespace and return the first field suffixed with "%" and +pos+ as the key, and the second field as the value. Reports via +message+ and returns +nil+ if an error is raised. |
| 252 | | def parse_morph_line( string, lineNumber, pos ) |
| 253 | | key, value = string.split |
| 254 | | return "#{key}%#{pos}", value |
| 255 | | rescue => err |
| 256 | | message "Morph entry did not parse: %s for %s (pos = %s, line %d)\n\t%s\n" % [ |
| 257 | | err.message, |
| 258 | | string.inspect, |
| 259 | | pos.inspect, |
| 260 | | lineNumber, |
| 261 | | err.backtrace[0] |
| 262 | | ] |
| 263 | | return nil |
| 264 | | end |
| 265 | | |
| 266 | | |
| 267 | | # Synset data patterns used by parse_synset_line: synset header, one word entry, pointer count, one pointer, frame count, one verb frame, and the trailing gloss |
| 268 | | Synset = /(\d+)\s(\d{2})\s(\w)\s(\w{2})\s/ |
| 269 | | SynWord = /(\S+)\s(\w)*\s*/ |
| 270 | | SynPtrCnt = /(\d{3})\s/ |
| 271 | | SynPtr = /(\S{1,2})\s(\d+)\s(\w)\s(\w{4})\s/ |
| 272 | | SynFrameCnt = /\s*(\d{2})\s/ |
| 273 | | SynFrame = /\+\s(\d{2})\s(\w{2})\s/ |
| 274 | | SynGloss = /\s*\|\s*(.+)?/ |
| 275 | | |
| 276 | | ### Parse an entry from a data file and return the key and data. Returns +nil+ |
| 277 | | ### if any part of the entry isn't able to be parsed. |
| 278 | | def parse_synset_line( string, lineNumber, pos ) |
| 279 | | $scanner.string = string |
| | 310 | |
| | 311 | words.push( word + "%" + $senseIndex[senseKey].to_s ) |
| | 312 | end |
| 281 | | filenum, synsetType, gloss = nil, nil, nil |
| 282 | | words = [] |
| 283 | | ptrs = [] |
| 284 | | frames = [] |
| 285 | | |
| 286 | | # Parse the first part of the synset |
| 287 | | $scanner.scan( Synset ) or raise "unable to parse synset" |
| 288 | | offset, filenum, synsetType, wordCount = |
| 289 | | $scanner[1], $scanner[2], $scanner[3], $scanner[4] |
| 290 | | |
| 291 | | # Parse the words |
| 292 | | wordCount.to_i(16).times do |i| |
| 293 | | $scanner.scan( SynWord ) or raise "unable to parse word #{i}" |
| 294 | | word, lexid = $scanner[1], $scanner[2] |
| 295 | | senseKey = (offset + "%" + pos + "%" + word).downcase |
| 296 | | if !$senseIndex.key?( senseKey ) |
| 297 | | newKey = senseKey.sub( /\(\w+\)$/, '' ) |
| 298 | | if !$senseIndex.key?( newKey ) |
| 299 | | raise "Sense index does not contain sense '#{senseKey}' "\ |
| 300 | | "(tried #{newKey}, too)." |
| 301 | | end |
| 302 | | senseKey = newKey |
| 303 | | end |
| 304 | | |
| 305 | | words.push( word + "%" + $senseIndex[senseKey].to_s ) |
| 306 | | end |
| 307 | | |
| 308 | | # Parse pointers |
| 309 | | if $scanner.scan( SynPtrCnt ) |
| 310 | | $scanner[1].to_i.times do |i| |
| 311 | | $scanner.scan( SynPtr ) or raise "unable to parse synptr #{i}" |
| 312 | | ptrs.push "%s %s%%%s %s" % [ |
| 313 | | $scanner[1], |
| 314 | | $scanner[2], |
| 315 | | $scanner[3], |
| 316 | | $scanner[4], |
| 317 | | ] |
| 318 | | end |
| 319 | | else |
| 320 | | raise "Couldn't parse pointer count" |
| 321 | | end |
| 322 | | |
| 323 | | # Parse frames if this synset is a verb |
| 324 | | if synsetType == WordNet::Verb |
| 325 | | if $scanner.scan( SynFrameCnt ) |
| | 314 | # Parse pointers |
| | 315 | if $scanner.scan( SynPtrCnt ) |
| 331 | | raise "Couldn't parse frame count" |
| 332 | | end |
| | 326 | raise "Couldn't parse pointer count" |
| | 327 | end |
| | 328 | |
| | 329 | # Parse frames if this synset is a verb |
| | 330 | if synsetType == WordNet::Verb |
| | 331 | if $scanner.scan( SynFrameCnt ) |
| | 332 | $scanner[1].to_i.times do |i| |
| | 333 | $scanner.scan( SynFrame ) or raise "unable to parse frame #{i}" |
| | 334 | frames.push "#{$scanner[1]} #{$scanner[2]}" |
| | 335 | end |
| | 336 | else |
| | 337 | raise "Couldn't parse frame count" |
| | 338 | end |
| | 339 | end |
| | 340 | |
| | 341 | # Find the gloss |
| | 342 | if $scanner.scan( SynGloss ) |
| | 343 | gloss = $scanner[1].strip |
| | 344 | end |
| | 345 | |
| | 346 | # This should never happen, as the gloss matches pretty much anything to |
| | 347 | # the end of line. |
| | 348 | if !$scanner.empty? |
| | 349 | raise "Trailing miscellaneous found at end of entry" |
| | 350 | end |
| | 351 | |
| | 352 | # Build the synset entry and return it |
| | 353 | synsetType = WordNet::Adjective if synsetType == WordNet::Other |
| | 354 | key = [ offset, synsetType ].join("%") |
| | 355 | data = [ |
| | 356 | filenum, |
| | 357 | words.join( WordNet::SUB_DELIM ), |
| | 358 | ptrs.join( WordNet::SUB_DELIM ), |
| | 359 | frames.join( WordNet::SUB_DELIM ), |
| | 360 | gloss, |
| | 361 | ].join( WordNet::DELIM ) |
| | 362 | |
| | 363 | return key, data |
| | 364 | rescue => err |
| | 365 | message "Synset did not parse: %s at '%s...' (pos = %s, line %d)\n\t%s\n" % [ |
| | 366 | err.message, |
| | 367 | $scanner.rest[0,20], |
| | 368 | pos.inspect, |
| | 369 | lineNumber, |
| | 370 | err.backtrace[0] |
| | 371 | ] |
| | 372 | return nil |