|
/*
 * Adaptation of the Teragram vulgarity dictionaries to Ruby.
 */
|
|
#include <ruby.h>
|
|
#include <vulgarityfilter.h> // Class: Dictionaries
|
|
#include "teragram.h"
|
|
|
|
/*
 * Debug tracing.
 *
 * Uncomment the define below (or pass -D_VERBOSE_DEBUG to the compiler) to
 * make teragram_printf() forward its arguments to printf().  When tracing is
 * disabled the macro expands to the no-op expression ((void)0), so
 *   - its arguments are never evaluated, and
 *   - `teragram_printf(...);` is still a complete statement, which keeps it
 *     safe as the sole body of an if/else and avoids empty-body warnings.
 */
//#define _VERBOSE_DEBUG

#ifdef _VERBOSE_DEBUG
# include <stdio.h>
# define teragram_printf(...) printf(__VA_ARGS__)
#else
# define teragram_printf(...) ((void)0)
#endif
|
|
|
|
Dictionaries gstDicts;
|
|
|
|
/* holder for ruby "Teragram" module/exceptions */
|
|
VALUE rb_cTeragram, rb_eTeragramException;
|
|
|
|
/* Filters the input string for vulgarity, masking vulgar content in-place. */
|
|
VALUE teragram_filter(VALUE self, VALUE string)
|
|
{
|
|
teragram_printf( "%s\n", __FUNCTION__ );
|
|
VALUE RetVal = Qnil;
|
|
int iStrLen = 0, ret = 0, iVulgarCounter = 0;
|
|
char* acTmpMsg = NULL;
|
|
char* acOutput = NULL;
|
|
char* input_text = StringValuePtr(string);
|
|
|
|
#if defined _VERBOSE_DEBUG
|
|
fprintf(stderr, "DEBUG: word = '%s'\n", input_text);
|
|
#endif
|
|
|
|
if( !input_text )
|
|
return (VALUE)NULL;//Give me nothing, and I'll give you nothing in return.
|
|
|
|
iStrLen = strlen(input_text);
|
|
|
|
/*//////////////////////////////////////////////////////////////////////////
|
|
// Allocate a temporary buffer to hold the filtered text.
|
|
//////////////////////////////////////////////////////////////////////////*/
|
|
acTmpMsg = (char*)malloc(iStrLen + 1);
|
|
if( !acTmpMsg )
|
|
{
|
|
/* XXX should this just print to stderr and continue instead? */
|
|
rb_raise(rb_eTeragramException, "%", "Vulgarity filter is out of memory");
|
|
|
|
/* If I can't guarantee that the text is NOT vulgar, I have to omit all
|
|
* of it. Return nothing.
|
|
*/
|
|
return (VALUE)NULL;
|
|
}
|
|
|
|
/*//////////////////////////////////////////////////////////////////////////
|
|
// Execute the SCE-RT vulgarity filter on the input text
|
|
//////////////////////////////////////////////////////////////////////////*/
|
|
ret = iMaskVulgarWords(
|
|
&gstDicts,
|
|
input_text,
|
|
acTmpMsg,
|
|
&iVulgarCounter
|
|
);
|
|
|
|
if( ret != FILTER_SUCCESS )
|
|
{
|
|
/* XXX should this just print to stderr and continue instead? */
|
|
rb_raise(rb_eTeragramException, "%", "Failed to perform full-word filtering on input text");
|
|
free(acTmpMsg);
|
|
|
|
/* If I can't guarantee that the text is NOT vulgar, I have to omit all
|
|
* of it. Return nothing.
|
|
*/
|
|
return (VALUE)NULL;
|
|
}
|
|
|
|
/*//////////////////////////////////////////////////////////////////////////
|
|
// Perform a substring filter on the content.
|
|
//////////////////////////////////////////////////////////////////////////*/
|
|
acOutput = (char*)malloc(iStrLen + 1);
|
|
ret = iMaskSubstringMatches(
|
|
gstDicts.fpat,
|
|
1,
|
|
acTmpMsg,
|
|
acOutput,
|
|
&iVulgarCounter
|
|
);
|
|
|
|
if( ret != FILTER_SUCCESS )
|
|
{
|
|
rb_raise(rb_eTeragramException, "%", "Failed to perform substring filtering on input text");
|
|
free(acOutput);
|
|
free(acTmpMsg);
|
|
return (VALUE)NULL;
|
|
}
|
|
|
|
// convert char * to Ruby string type
|
|
RetVal = rb_str_new2(acOutput);
|
|
|
|
// free our allocated buffers
|
|
free(acOutput); // filtered output as char *
|
|
free(acTmpMsg); // temp filtered string
|
|
|
|
return RetVal;
|
|
}
|
|
|
|
|
|
/*
 * Teragram#dictdir
 *
 * Returns the value stored in the dictdir field of the teragram_t wrapped by
 * the receiver (Qnil until teragram_initialize() has stored something).
 *
 * This function is registered with rb_define_method() with arity 0, so Ruby
 * calls it with the receiver VALUE.  The parameter keeps its historical
 * teragram_t* spelling; it is converted back to a VALUE before being handed
 * to Data_Get_Struct(), which unwraps the real teragram_t pointer (and raises
 * TypeError if the receiver is not a wrapped data object, so no separate
 * NULL test is needed afterwards).
 */
VALUE teragram_dictdir(teragram_t* self)
{
    VALUE receiver = (VALUE)self;
    teragram_t *teragram = NULL;

    teragram_printf( "%s(self = %p)\n", __func__, (void *)self );

    Data_Get_Struct(receiver, teragram_t, teragram);

    /* Only stringify for the trace when there is something to stringify;
     * StringValuePtr() on nil would raise. */
    teragram_printf( "teragram->dictdir = '%s'\n",
                     NIL_P(teragram->dictdir) ? "(nil)" : StringValuePtr(teragram->dictdir) );

    return teragram->dictdir;
}
|
|
|
|
|
|
/*
 * GC mark callback registered through Data_Wrap_Struct() in
 * teragram_allocate(): marks the Ruby object referenced by the struct's
 * dictdir field.
 */
void teragram_mark(teragram_t* self)
{
    VALUE held_dictdir = self->dictdir;

    rb_gc_mark(held_dictdir);
}
|
|
|
|
|
|
/*
 * GC free callback registered through Data_Wrap_Struct() in
 * teragram_allocate(): releases the teragram_t that was obtained from
 * malloc() there.
 */
void teragram_free(teragram_t* self)
{
    void *wrapped = self;

    free(wrapped);
}
|
|
|
|
|
|
/*
 * Allocation function for the Teragram class (installed with
 * rb_define_alloc_func() in Init_teragram()).
 *
 * Allocates a teragram_t with malloc(), initialises its dictdir field to
 * Qnil and wraps it in a Ruby object of class `klass`, using teragram_mark()
 * and teragram_free() as the GC callbacks.
 *
 * Raises NoMemoryError (via rb_memerror(), which does not return) if the
 * struct cannot be allocated, instead of dereferencing a NULL pointer.
 */
VALUE teragram_allocate(VALUE klass)
{
    teragram_t *t = malloc(sizeof *t);

    if( t == NULL )
        rb_memerror();

    t->dictdir = Qnil;
    return Data_Wrap_Struct(klass, teragram_mark, teragram_free, t);
}
|
|
|
|
|
|
/*
 * Teragram#is_vulgar?(string) -> true or false
 *
 * Runs the vulgarity detection on the input text using the global
 * dictionaries (gstDicts): first iDetermineVulgarity(), then
 * iDetermineSubStringMatch().
 *
 * Parameters:
 *   klass  - the receiver (unused; the function is registered as an
 *            instance method in Init_teragram()).
 *   string - the text to test; converted with StringValuePtr().
 *
 * Returns Qtrue as soon as either check reports FILTER_VULGAR *or*
 * FILTER_FAILED -- a check that fails is treated the same as a vulgar
 * result.  Returns Qfalse for empty text or when both checks report
 * anything else.
 */
VALUE teragram_is_vulgar(VALUE klass, VALUE string)
{
    int ret = 0;
    char *input_text = StringValuePtr(string);

    /* Nothing to inspect: empty text is never vulgar. */
    if( !input_text || input_text[0] == '\0' )
        return Qfalse;

    /* Execute the SCE-RT vulgarity detection on the input text. */
    ret = iDetermineVulgarity(&gstDicts, input_text);
    if( (ret == FILTER_FAILED) || (ret == FILTER_VULGAR) )
        return Qtrue;

    ret = iDetermineSubStringMatch(gstDicts.fpat, input_text);
    if( (ret == FILTER_FAILED) || (ret == FILTER_VULGAR) )
        return Qtrue;

    return Qfalse;
}
|
|
|
|
|
|
/*
 * Teragram#initialize(dictdir)
 *
 * Records `dictdir` on the receiver and loads the vulgarity dictionaries
 * from that path into the global gstDicts via load_vulgarity_dictionaries().
 *
 * Parameters:
 *   self    - the object being initialised (wraps a teragram_t).
 *   dictdir - dictionary directory; must be a String or implicitly
 *             convertible to one.
 *
 * Returns self.
 *
 * Raises ArgumentError if dictdir does not respond to to_s, and TypeError
 * (from StringValue) if it cannot be converted to a String.  The conversion
 * happens before the receiver is touched, so a failed call leaves the object
 * unchanged and the stored dictdir is always the same String that was used
 * for loading.
 *
 * NOTE(review): the result of load_vulgarity_dictionaries() is only traced,
 * never checked, so a failed load goes unnoticed here.  Its success value is
 * not visible in this file -- confirm against vulgarityfilter.h and consider
 * raising Teragram::Exception on failure.
 */
VALUE teragram_initialize(VALUE self, VALUE dictdir)
{
    teragram_t *teragram = NULL;
    int ret = 0;

    if( !rb_respond_to(dictdir, rb_intern("to_s")) )
        rb_raise(rb_eArgError, "dictdir must be a string that responds to to_s");

    /* Replaces the local with the converted String, or raises TypeError. */
    StringValue(dictdir);

    teragram_printf( "%s( %p, '%s' )\n", __func__, (void *)self, StringValuePtr(dictdir) );

    Data_Get_Struct(self, teragram_t, teragram);
    teragram->dictdir = dictdir;

    ret = load_vulgarity_dictionaries(StringValuePtr(dictdir), &gstDicts);
    teragram_printf( "load_vulgarity_dictionaries returned %d\n", ret );
    (void)ret;  /* only consumed by the trace above when tracing is enabled */

    return self;
}
|
|
|
|
|
|
void Init_teragram()
|
|
{
|
|
teragram_printf( "%s\n", __FUNCTION__ );
|
|
rb_cTeragram = rb_define_class( "Teragram", rb_cObject );
|
|
rb_eTeragramException = rb_define_class_under( rb_cTeragram, "Exception", rb_eStandardError );
|
|
|
|
rb_define_alloc_func(rb_cTeragram, teragram_allocate);
|
|
|
|
/*rb_define_module_function(rb_cTeragram, "filter", teragram_filter, 1);*/
|
|
rb_define_method(rb_cTeragram, "initialize", teragram_initialize, 1);
|
|
rb_define_method(rb_cTeragram, "dictdir", teragram_dictdir, 0);
|
|
rb_define_method(rb_cTeragram, "filter", teragram_filter, 1);
|
|
rb_define_method(rb_cTeragram, "is_vulgar?", teragram_is_vulgar, 1);
|
|
}
|
|
|