| 38 | | module WordNet |
| 39 | | |
### Lexicon exception - raised when something has gone wrong in the
### internals of the lexicon.
class LexiconError < StandardError
end
| 43 | | |
### Lookup error - raised when the object being looked up either doesn't
### exist or is malformed.
class LookupError < StandardError
end
| 47 | | |
| 48 | | ### WordNet lexicon class - abstracts access to the WordNet lexical |
| 49 | | ### databases, and provides factory methods for looking up and creating new |
| 50 | | ### WordNet::Synset objects. |
| 51 | | class Lexicon |
| 52 | | include WordNet::Constants |
| 53 | | include CrossCase if defined?( CrossCase ) |
| 54 | | |
# Class constants: the version number is extracted from the revision
# keyword string; the Id keyword string is kept as-is.
Version = %q{$Revision: 1.4 $}[ /([\d\.]+)/, 1 ]
Rcsid = %q$Id$

#############################################################
### B E R K E L E Y D B C O N F I G U R A T I O N
#############################################################

# The path to the WordNet BerkeleyDB Env: a "lexicon" directory located
# in the same directory as this source file.
DbFile = File.join( File.dirname(__FILE__), "lexicon" )
| 66 | | |
| 67 | | # Options for the creation of the Env object |
| 68 | | EnvOptions = { |
| 69 | | :set_timeout => 50, |
| 70 | | :set_lk_detect => 1, |
| 71 | | :set_verbose => false, |
### Lexicon exception - raised when something has gone wrong in the
### internals of the lexicon.
class WordNet::LexiconError < StandardError
end
| | 42 | |
### Lookup error - raised when the object being looked up either doesn't
### exist or is malformed.
class WordNet::LookupError < StandardError
end
| | 46 | |
| | 47 | ### WordNet lexicon class - abstracts access to the WordNet lexical |
| | 48 | ### databases, and provides factory methods for looking up and creating new |
| | 49 | ### WordNet::Synset objects. |
| | 50 | class WordNet::Lexicon |
| | 51 | include WordNet::Constants |
| | 52 | include CrossCase if defined?( CrossCase ) |
| | 53 | |
# Subversion Id keyword string
SvnId = %q$Id$

# Subversion revision keyword string
SvnRev = %q$Rev$
| | 59 | |
| | 60 | |
| | 61 | ############################################################# |
| | 62 | ### B E R K E L E Y D B C O N F I G U R A T I O N |
| | 63 | ############################################################# |
| | 64 | |
# RbConfig replaces the legacy top-level Config constant, which is
# deprecated and no longer defined by current Ruby releases.
require 'rbconfig'

# The default path to the WordNet BerkeleyDB Env: a "ruby-wordnet"
# directory underneath Ruby's configured 'datadir'.
DefaultDbEnv = File::join( RbConfig::CONFIG['datadir'], "ruby-wordnet" )
| | 68 | |
# Options passed to BDB::Env::new when the Env object is created
EnvOptions = {
  :set_timeout   => 50,
  :set_lk_detect => 1,
  :set_verbose   => false
}
| | 75 | |
# Flags for the creation of the Env object: the three BDB flag constants
# OR-ed together into a single bitmask.
EnvFlags = [ BDB::CREATE, BDB::INIT_TRANSACTION, BDB::RECOVER ].
  inject {|mask, flag| mask | flag }
| | 78 | |
# Table names (actually database names in BerkeleyDB), keyed by the
# symbolic name of each table
TableNames = {
  :index => "index",
  :data  => "data",
  :morph => "morph"
}
| | 85 | |
| | 86 | |
| | 87 | |
| | 88 | ############################################################# |
| | 89 | ### I N S T A N C E M E T H O D S |
| | 90 | ############################################################# |
| | 91 | |
### Create a new WordNet::Lexicon object that will read its data from
### the given +dbenv+ (a BerkeleyDB env directory). The database will be
### opened with the specified +mode+, which can either be a numeric
### octal mode (e.g., 0444) or one of (:readonly, :readwrite); it is passed
### through #normalize_mode before use.
###
### Raises an ArgumentError if +dbenv+ is not an existing directory. Any
### StandardError raised while opening the environment or its databases
### is re-raised as the same exception class with a message prefixed by
### "Error while opening Ruby-WordNet data files".
def initialize( dbenv=DefaultDbEnv, mode=:readonly )
  raise ArgumentError, "Cannot find data directory '#{dbenv}'" unless
    File::directory?( dbenv )

  mode = normalize_mode( mode )

  begin
    @env = BDB::Env::new( dbenv, EnvFlags, EnvOptions )

    # Database names come from TableNames so they are declared in exactly
    # one place instead of being repeated here as literals.
    @indexDb, @dataDb, @morphDb = [ :index, :data, :morph ].collect do |table|
      @env.open_db( BDB::BTREE, TableNames[table], nil, BDB::CREATE, mode )
    end
  rescue StandardError => err
    msg = "Error while opening Ruby-WordNet data files: %s" % err.message
    raise err.exception( msg )
  end
end
| | 112 | |
| | 113 | |
| | 114 | |
| | 115 | ###### |
| | 116 | public |
| | 117 | ###### |
| | 118 | |
# Read-only accessors:
#   env     - the BDB::Env object which contains the wordnet lexicon's
#             databases
#   indexDb - the handle to the index table
#   dataDb  - the handle to the synset data table
#   morphDb - the handle to the morph table
attr_reader :env, :indexDb, :dataDb, :morphDb
| | 130 | |
| | 131 | |
### Close the lexicon's database environment by calling #close on the
### underlying BDB::Env object. Returns whatever that call returns.
def close
  @env.close
end
| | 136 | |
| | 137 | |
### Checkpoint the database. (BerkeleyDB-specific)
###
### The +bytes+ and +minutes+ arguments are accepted, but they are not
### forwarded: the environment's #checkpoint is called without arguments.
def checkpoint( bytes=0, minutes=0 )
  @env.checkpoint
end
| | 142 | |
| | 143 | |
### Return a list of archival logfiles that can be removed
### safely. (BerkeleyDB-specific). The list is whatever the environment's
### #log_archive returns when called with BDB::ARCH_ABS.
def archlogs
  @env.log_archive( BDB::ARCH_ABS )
end
| | 149 | |
| | 150 | |
| | 151 | ### Remove any archival logfiles for the lexicon's database |
| | 152 | ### environment. (BerkeleyDB-specific). |
| | 153 | def cleanLogs |
| | 154 | self.archlogs.each {|logfile| |
| | 155 | File::chmod( 0777, logfile ) |
| | 156 | File::delete( logfile ) |
| 73 | | |
# Flags for the creation of the Env object: the three BDB flag constants
# OR-ed together into a single bitmask.
EnvFlags = [ BDB::CREATE, BDB::INIT_TRANSACTION, BDB::RECOVER ].
  inject {|mask, flag| mask | flag }
| 76 | | |
| 77 | | # Table names (actually database names in BerkeleyDB) |
| 78 | | TableNames = { |
| 79 | | :index => "index", |
| 80 | | :data => "data", |
| 81 | | :morph => "morph", |
| | 158 | end |
| | 159 | |
| | 160 | |
### Returns an integer of the familiarity/polysemy count for +word+ as a
### +partOfSpeech+: the number of delimiter-separated items in the word's
### index entry. Returns +nil+ if the index has no entry for the word.
### Note that polysemy can also be identified for a given word by
### counting the synsets returned by #lookupSynsets. The +polyCount+
### argument is accepted but not used.
def familiarity( word, partOfSpeech, polyCount=nil )
  key = makeWordKey( word, partOfSpeech )

  if @indexDb.key?( key )
    @indexDb[ key ].split( WordNet::SubDelimRe ).length
  else
    nil
  end
end
| | 169 | |
| | 170 | |
### Look up synsets (WordNet::Synset objects) matching +word+ as a
### +partOfSpeech+, where +partOfSpeech+ is one of +WordNet::Noun+,
### +WordNet::Verb+, +WordNet::Adjective+, or +WordNet::Adverb+. Without
### +sense+, #lookupSynsets will return all matches that are a
### +partOfSpeech+. If +sense+ (a 1-based index into the word's index
### entry) is specified, only the synset object that matches that
### particular +partOfSpeech+ and +sense+ is returned. Returns +nil+ if
### neither the word nor its morphological conversion is in the index.
def lookupSynsets( word, partOfSpeech, sense=nil )
  pos = makePos( partOfSpeech )

  # Look up the index entry, trying first the word as given, and if
  # that fails, trying morphological conversion. The key has to be
  # rebuilt from the converted word; re-reading the original key would
  # just repeat the lookup that already failed.
  entry = @indexDb[ makeWordKey(word, partOfSpeech) ]
  if entry.nil? && (root = morph( word, partOfSpeech ))
    entry = @indexDb[ makeWordKey(root, partOfSpeech) ]
  end

  # If the lookup failed both ways, just abort
  return nil unless entry

  # Make synset keys from the entry, narrowing it to just the sense
  # requested if one was specified.
  synkeys = entry.split( SubDelimRe ).collect {|off| "#{off}%#{pos}" }
  if sense
    return lookupSynsetsByKey( synkeys[sense - 1] )
  else
    return [ lookupSynsetsByKey(*synkeys) ].flatten
  end
end
| | 201 | |
| | 202 | |
| | 203 | ### Returns the WordNet::Synset objects corresponding to the +keys+ |
| | 204 | ### specified. The +keys+ are made up of the target synset's "offset" |
| | 205 | ### and syntactic category catenated together with a '%' character. |
| | 206 | def lookupSynsetsByKey( *keys ) |
| | 207 | synsets = [] |
| | 208 | |
| | 209 | keys.each {|key| |
| | 210 | raise LookupError, "Failed lookup of synset '#{key}':"\ |
| | 211 | "No such synset" unless @dataDb.key?( key ) |
| | 212 | |
| | 213 | data = @dataDb[ key ] |
| | 214 | offset, partOfSpeech = key.split( /%/, 2 ) |
| | 215 | synsets << WordNet::Synset::new( self, offset, partOfSpeech, nil, data ) |
| 84 | | |
| 85 | | |
| 86 | | ############################################################# |
| 87 | | ### I N S T A N C E M E T H O D S |
| 88 | | ############################################################# |
| 89 | | |
| 90 | | ### Create a new WordNet::Lexicon object. |
| 91 | | def initialize |
| 92 | | Dir::mkdir( DbFile ) unless File::directory?( DbFile ) |
| 93 | | |
| 94 | | @env = BDB::Env::new( DbFile, EnvFlags, EnvOptions ) |
| 95 | | @indexDb = @env.open_db( BDB::BTREE, "index", nil, BDB::CREATE, 0666 ) |
| 96 | | @dataDb = @env.open_db( BDB::BTREE, "data", nil, BDB::CREATE, 0666 ) |
| 97 | | @morphDb = @env.open_db( BDB::BTREE, "morph", nil, BDB::CREATE, 0666 ) |
| | 218 | return *synsets |
| | 219 | end |
| | 220 | alias_method :lookupSynsetsByOffset, :lookupSynsetsByKey |
| | 221 | |
| | 222 | |
### Returns a form of +word+ as a part of speech +partOfSpeech+, as
### found in the WordNet morph files: the value stored in the morph table
### under the key built from +word+ and +partOfSpeech+. The
### #lookupSynsets method performs morphological conversion
### automatically, so a call to #morph is not required.
def morph( word, partOfSpeech )
  morphkey = makeWordKey( word, partOfSpeech )
  @morphDb[ morphkey ]
end
| | 230 | |
| | 231 | |
### Returns the result of looking up +word+ in the inverse of the WordNet
### morph files, i.e., the morph table key whose stored value is +word+.
### _(This is undocumented in Lingua::Wordnet)_
def reverseMorph( word )
  inverted = @morphDb.invert
  inverted[ word ]
end
| | 237 | |
| | 238 | |
| | 239 | ### Returns an array of compound words matching +text+. |
| | 240 | def grep( text ) |
| | 241 | return [] if text.empty? |
| | 242 | |
| | 243 | words = [] |
| | 244 | |
| | 245 | # Grab a cursor into the database and fetch while the key matches |
| | 246 | # the target text |
| | 247 | cursor = @indexDb.cursor |
| | 248 | rec = cursor.set_range( text ) |
| | 249 | while /^#{text}/ =~ rec[0] |
| | 250 | words.push rec[0] |
| | 251 | rec = cursor.next |
| 99 | | |
| 100 | | |
| 101 | | ###### |
| 102 | | public |
| 103 | | ###### |
| 104 | | |
# Read-only accessors:
#   env     - the BDB::Env object which contains the wordnet lexicon's
#             databases
#   indexDb - the handle to the index table
#   dataDb  - the handle to the synset data table
#   morphDb - the handle to the morph table
attr_reader :env, :indexDb, :dataDb, :morphDb
| 116 | | |
| 117 | | |
| 118 | | ### Checkpoint the database. (BerkeleyDB-specific) |
| 119 | | def checkpoint( bytes=0, minutes=0 ) |
| 120 | | @env.checkpoint |
| | 253 | cursor.close |
| | 254 | |
| | 255 | return *words |
| | 256 | end |
| | 257 | |
| | 258 | |
### Factory method: Creates and returns a new WordNet::Synset object in
### this lexicon for the specified +word+ and +partOfSpeech+. The synset
### is constructed with an empty string as its offset.
def createSynset( word, partOfSpeech )
  WordNet::Synset.new( self, '', partOfSpeech, word )
end
| | 264 | alias_method :newSynset, :createSynset |
| | 265 | |
| | 266 | |
### Store the specified +synset+ (a WordNet::Synset object) in the
### lexicon. Writes the serialized synset to the data table under
### +synset.key+, then makes sure the index entry for each of the
### synset's words lists the synset's offset. Returns the offset of the
### stored synset.
def storeSynset( synset )

  # Start a transaction
  @env.begin( BDB::TXN_COMMIT, @dataDb ) do |txn, datadb|

    # If this is a new synset, generate an offset for it
    # NOTE(review): new synsets are detected by an offset equal to 1,
    # while #createSynset builds them with '' -- confirm against Synset.
    if synset.offset == 1
      synset.offset =
        (datadb['offsetcount'] = datadb['offsetcount'].to_i + 1)
    end

    # Write the data entry
    datadb[ synset.key ] = synset.serialize

    # Synset keys are "<offset>%<pos>", so the part-of-speech tag for the
    # index keys is whatever follows the first '%'.
    pos = synset.key.split( /%/, 2 ).last
    offset = synset.offset.to_s

    # Write the index entries
    txn.begin( BDB::TXN_COMMIT, @indexDb ) do |_subtxn, indexdb|

      # Make a word/part-of-speech index key for each word in the synset
      synset.words.each do |word|
        wordkey = "#{word}%#{pos}"

        offsets = indexdb.key?( wordkey ) ?
          indexdb[ wordkey ].split( SubDelimRe ) : []

        # If the index doesn't list this synset for the word yet, add it.
        # Offsets are compared as whole items (not substrings), and the
        # entry is assigned back so the change is actually stored.
        unless offsets.include?( offset )
          indexdb[ wordkey ] = (offsets << offset).join( SubDelim )
        end
      end
    end # transaction on @indexDb
  end # transaction on @dataDb

  return synset.offset
end
| | 305 | |
| | 306 | |
| | 307 | ### Remove the specified +synset+ (a WordNet::Synset object) in the |
| | 308 | ### lexicon. Returns the offset of the stored synset. |
| | 309 | def removeSynset( synset ) |
| | 310 | # If it's not in the database (ie., doesn't have a real offset), |
| | 311 | # just return. |
| | 312 | return nil if synset.offset == 1 |
| | 313 | |
| | 314 | # Start a transaction on the data table |
| | 315 | @env.begin( BDB::TXN_COMMIT, @dataDb ) do |txn,datadb| |
| | 316 | |
| | 317 | # First remove the index entries for this synset by iterating |
| | 318 | # over each of its words |
| | 319 | txn.begin( BDB::TXN_COMMIT, @indexDb ) do |txn,indexdb| |
| | 320 | synset.words.collect {|word| word + "%" + pos }.each {|word| |
| | 321 | |
| | 322 | # If the index contains an entry for this word, either |
| | 323 | # splice out the offset for the synset being deleted if |
| | 324 | # there are more than one, or just delete the whole |
| | 325 | # entry if it's the only one. |
| | 326 | if indexdb.key?( word ) |
| | 327 | offsets = indexdb[ word ]. |
| | 328 | split( SubDelimRe ). |
| | 329 | reject {|offset| offset == synset.offset} |
| | 330 | |
| | 331 | unless offsets.empty? |
| | 332 | indexDb[ word ] = newoffsets.join( SubDelim ) |
| | 333 | else |
| | 334 | indexDb.delete( word ) |
| | 335 | end |
| | 336 | end |
| | 337 | } |
| | 338 | end |
| | 339 | |
| | 340 | # :TODO: Delete synset from pointers of related synsets |
| | 341 | |
| | 342 | # Delete the synset from the main db |
| | 343 | datadb.delete( synset.offset ) |
| 123 | | |
### Return a list of archival logfiles that can be removed
### safely. (BerkeleyDB-specific). The list is whatever the environment's
### #log_archive returns when called with BDB::ARCH_ABS.
def archlogs
  @env.log_archive( BDB::ARCH_ABS )
end
| 129 | | |
| 130 | | |
### Remove any archival logfiles for the lexicon's database
### environment. (BerkeleyDB-specific). Each file listed by #archlogs has
### its mode set to 0777 and is then deleted. Returns the list of
### logfiles that was iterated over.
def cleanLogs
  archlogs.each do |path|
    File.chmod( 0777, path )
    File.delete( path )
  end
end
| 139 | | |
| 140 | | |
### Returns an integer of the familiarity/polysemy count for +word+ as a
### +partOfSpeech+: the number of delimiter-separated items in the word's
### index entry. Returns +nil+ if the index has no entry for the word.
### Note that polysemy can also be identified for a given word by
### counting the synsets returned by #lookupSynsets. The +polyCount+
### argument is accepted but not used.
def familiarity( word, partOfSpeech, polyCount=nil )
  key = makeWordKey( word, partOfSpeech )

  if @indexDb.key?( key )
    @indexDb[ key ].split( WordNet::SubDelimRe ).length
  else
    nil
  end
end
| 149 | | |
| 150 | | |
### Look up synsets (WordNet::Synset objects) matching +word+ as a
### +partOfSpeech+, where +partOfSpeech+ is one of +WordNet::Noun+,
### +WordNet::Verb+, +WordNet::Adjective+, or +WordNet::Adverb+. Without
### +sense+, #lookupSynsets will return all matches that are a
### +partOfSpeech+. If +sense+ (a 1-based index into the word's index
### entry) is specified, only the synset object that matches that
### particular +partOfSpeech+ and +sense+ is returned. Returns +nil+ if
### neither the word nor its morphological conversion is in the index.
def lookupSynsets( word, partOfSpeech, sense=nil )
  pos = makePos( partOfSpeech )

  # Look up the index entry, trying first the word as given, and if
  # that fails, trying morphological conversion. The key has to be
  # rebuilt from the converted word; re-reading the original key would
  # just repeat the lookup that already failed.
  entry = @indexDb[ makeWordKey(word, partOfSpeech) ]
  if entry.nil? && (root = morph( word, partOfSpeech ))
    entry = @indexDb[ makeWordKey(root, partOfSpeech) ]
  end

  # If the lookup failed both ways, just abort
  return nil unless entry

  # Make synset keys from the entry, narrowing it to just the sense
  # requested if one was specified.
  synkeys = entry.split( SubDelimRe ).collect {|off| "#{off}%#{pos}" }
  if sense
    return lookupSynsetsByKey( synkeys[sense - 1] )
  else
    return [ lookupSynsetsByKey(*synkeys) ].flatten
  end
end
| 181 | | |
| 182 | | |
### Returns the WordNet::Synset objects corresponding to the +keys+
### specified. The +keys+ are made up of the target synset's "offset"
### and syntactic category catenated together with a '%' character.
### Raises a LookupError for the first key that is not present in the
### data table.
def lookupSynsetsByKey( *keys )
  synsets = keys.collect do |key|
    unless @dataDb.key?( key )
      raise LookupError, "Failed lookup of synset '#{key}':"\
        "No such synset"
    end

    # Split the key back into the two constructor arguments
    offset, partOfSpeech = key.split( /%/, 2 )
    Synset::new( self, offset, partOfSpeech, nil, @dataDb[key] )
  end

  return *synsets
end
| 200 | | alias_method :lookupSynsetsByOffset, :lookupSynsetsByKey |
| 201 | | |
| 202 | | |
### Returns a form of +word+ as a part of speech +partOfSpeech+, as
### found in the WordNet morph files: the value stored in the morph table
### under the key built from +word+ and +partOfSpeech+. The
### #lookupSynsets method performs morphological conversion
### automatically, so a call to #morph is not required.
def morph( word, partOfSpeech )
  morphkey = makeWordKey( word, partOfSpeech )
  @morphDb[ morphkey ]
end
| 210 |