printable version
- email this article
View article without comments
Spam class refactoring
by johnk
Saturday, Oct. 21, 2006 at 1:21 AM
This isn't a full refactor...
This is a slightly refactored version of spam_class.inc, called spamb_class.inc. It just splits the Detect() code into individual methods. That way, it can be used to check spam on other parts of the site (or I guess in general). <pre> include_once("shared/global.cfg");
/** * This is a lot like Spam, except it'll operate on arbitrary strings, not just articles. * The different spam detection methods are in separate functions. */ class SpamB {
/* Constructor in the PHP 4 style (a method named after the class); its body holds only a comment. NOTE(review): on this collapsed line the line comment also swallows the closing brace, so this listing is not runnable as pasted. */
function SpamB () { // Nothing }
/* detect_strings: loads spam_strings.txt from SF_CACHE_PATH, skips entries that are falsy after right-trimming, and returns 1 as soon as an entry, used as a case-insensitive regular expression, matches one piece of $data; returns 0 otherwise. NOTE(review): both explode calls split on the letter n here; this looks like a newline escape that lost its backslash in the paste - confirm against the indented listing further down the page. */
/* detect_ip: reads next_ip_to_block.txt and tests each trimmed entry, anchored at both ends, as a regular expression against the REMOTE_ADDR server variable. The $ip argument is never read. NOTE(review): file() is handed the string r as its second argument, which is not one of its documented flags - confirm. */
function detect_strings( &$data ) { $stringstext = file_get_contents(SF_CACHE_PATH.'/spam_strings.txt'); $strings = explode("n",$stringstext); foreach ( $strings as $string ) { $string = rtrim( $string ); if (! $string) continue; $lines = explode( "n", $data ); foreach ( $lines as $line ) { if ( preg_match( "/$string/i", $line ) ) { return 1; } } } return 0; } function detect_ip( $ip ) { $lines = file(SF_CACHE_PATH.'/next_ip_to_block.txt' ,'r'); $user_ip=trim($_SERVER['REMOTE_ADDR']); for ( $i=0 ; $i < count($lines) ; $i++) { $saved_ip = rtrim(trim($lines[$i]));
/* Rest of detect_ip: on a match the request details (address, time, user agent, referer, URI) are handed to Log and 1 is returned; with no match the result is 0. */
/* remember_content: when more seconds than the spam_filter_time global have passed since the change time of hashes_time, hashes.txt and hashes_content.txt are deleted and recreated and hashes_time is touched; afterwards the md5 of $data is appended to hashes.txt. */
/* detect_repeated_content: counts the lines of hashes_content.txt that compare equal to the md5 of $data. NOTE(review): remember_content appends to hashes.txt while this method reads hashes_content.txt, and file() keeps line endings by default, so the comparison may never succeed - confirm. */
if ( preg_match( "/^$saved_ip$/", $user_ip ) ) { $this->Log( $_SERVER['REMOTE_ADDR']."|". date("m-d-y g:ia")."|". $_SERVER['HTTP_USER_AGENT']."|". $_SERVER['HTTP_REFERER']."|". $_SERVER['REQUEST_URI'] ); return 1; } } return 0; } /** not tested */ function remember_content( &$data ) { if ( $GLOBALS['spam_filter_time'] < (time()-filectime(SF_CACHE_PATH.'/hashes_time')) ) { unlink(SF_CACHE_PATH.'/hashes.txt'); touch(SF_CACHE_PATH.'/hashes.txt'); touch(SF_CACHE_PATH.'/hashes_time'); unlink(SF_CACHE_PATH.'/hashes_content.txt'); touch(SF_CACHE_PATH.'/hashes_content.txt'); } $hashes = fopen(SF_CACHE_PATH.'/hashes.txt','a'); fputs( $hashes, md5($data)."n" ); fclose ($hashes); } /** * @return the number of matches * not tested */ function detect_repeated_content( &$data ) { $hashes = file(SF_CACHE_PATH.'/hashes_content.txt','r'); $dataHash = md5($data); $matched_hashes = 0; foreach( $hashes as $hash ) { if ( $hash == $dataHash ) $matched_hashes++; } return $matched_hashes; }
/* Log: appends one line, made of the current date and time followed by $text, to ipblock.log under SF_CACHE_PATH. */
function Log ($text) {
$log = fopen(SF_CACHE_PATH."/ipblock.log","a"); fwrite($log, date("m-d-y g:ia").": ".$text."n"); fclose($log); }
/* too_many_urls: strips http-prefixed runs, the words href and url, and www-prefixed runs from $str, then compares the sizes of the stripped and original values. NOTE(review): sizeof is applied to strings rather than arrays and the quantifiers are lazy; the author posts a corrected version further down the page. */
function too_many_urls( $str ) { $sansUrls = preg_replace( '/http:[a-zA-Z0-9.\/]+?/', '', $str ); $sansHref = preg_replace( '/href/', '', $sansUrls ); $sansUrl = preg_replace( '/url/', '', $sansHref ); $sansWww = preg_replace( '/www[a-zA-Z0-9.\/]+?/', '', $sansUrl ); if ((sizeof($sansWww)/sizeof($str)) > 0.5) return 1; else return 0; } } </pre>
Ooops - here it is with proper indenting
by johnk
Saturday, Oct. 21, 2006 at 1:21 AM
include_once("shared/global.cfg");
/**
 * Spam checks that operate on arbitrary strings, not just articles.
 *
 * Every detection method is a separate function so each check can be used
 * on its own. All data files are looked up under the SF_CACHE_PATH constant,
 * which this class expects to be defined before any method is called
 * (NOTE(review): presumably by shared/global.cfg - confirm).
 *
 * The syntax is kept PHP 4 compatible (no visibility keywords, constructor
 * named after the class) to match the rest of the file.
 */
class SpamB
{
    /**
     * Old-style constructor; there is nothing to initialise.
     */
    function SpamB ()
    {
        // Nothing
    }

    /**
     * Checks text against the patterns listed in spam_strings.txt.
     *
     * Each non-empty line of the file is used as a case-insensitive regular
     * expression and tried against every line of $data separately.
     *
     * @param string $data text to inspect (taken by reference, never modified)
     * @return int 1 if any pattern matches any line, 0 otherwise or when the
     *             pattern file cannot be read
     */
    function detect_strings( &$data )
    {
        $stringstext = file_get_contents(SF_CACHE_PATH.'/spam_strings.txt');
        if ( $stringstext === false )
            return 0;

        // The input does not change between patterns, so split it only once.
        $lines = explode( "\n", $data );

        foreach ( explode( "\n", $stringstext ) as $string )
        {
            $string = rtrim( $string );
            if (! $string)
                continue;
            foreach ( $lines as $line )
            {
                // Entries are regular expressions, so they are deliberately
                // not quoted; a malformed entry simply never matches.
                if ( preg_match( "/$string/i", $line ) )
                {
                    return 1;
                }
            }
        }
        return 0;
    }

    /**
     * Checks an address against the entries of next_ip_to_block.txt.
     *
     * Each non-blank line of the file is treated as a regular expression
     * anchored at both ends. A hit is written to the block log.
     *
     * @param string $ip address to check; when empty or not a string the
     *                   REMOTE_ADDR of the current request is used instead
     * @return int 1 if the address matches an entry, 0 otherwise or when the
     *             list cannot be read
     */
    function detect_ip( $ip )
    {
        // file() takes integer flags, not an fopen-style mode string.
        $lines = file(SF_CACHE_PATH.'/next_ip_to_block.txt');
        if ( $lines === false )
            return 0;

        $user_ip = is_string($ip) ? trim($ip) : '';
        if ( $user_ip === '' && isset($_SERVER['REMOTE_ADDR']) )
            $user_ip = trim($_SERVER['REMOTE_ADDR']);
        if ( $user_ip === '' )
            return 0;

        foreach ( $lines as $line )
        {
            $saved_ip = trim($line);
            // A blank line would otherwise become the pattern /^$/.
            if ( $saved_ip === '' )
                continue;
            if ( preg_match( '/^'.$saved_ip.'$/', $user_ip ) )
            {
                // These request headers are optional, so guard each lookup.
                $agent   = isset($_SERVER['HTTP_USER_AGENT']) ? $_SERVER['HTTP_USER_AGENT'] : '';
                $referer = isset($_SERVER['HTTP_REFERER']) ? $_SERVER['HTTP_REFERER'] : '';
                $uri     = isset($_SERVER['REQUEST_URI']) ? $_SERVER['REQUEST_URI'] : '';
                $this->Log( $user_ip."|".
                    date("m-d-y g:ia")."|".
                    $agent."|".
                    $referer."|".
                    $uri );
                return 1;
            }
        }
        return 0;
    }

    /**
     * Records the md5 of $data in hashes.txt. Not tested.
     *
     * When more than $GLOBALS['spam_filter_time'] seconds have passed since
     * hashes_time last changed (or it does not exist yet), hashes.txt and
     * hashes_content.txt are emptied first and hashes_time is refreshed.
     *
     * NOTE(review): this appends to hashes.txt while
     * detect_repeated_content() reads hashes_content.txt - confirm which
     * file the two methods are meant to share.
     *
     * @param string $data text to remember (taken by reference, never modified)
     * @return void
     */
    function remember_content( &$data )
    {
        $time_file = SF_CACHE_PATH.'/hashes_time';
        $max_age = isset($GLOBALS['spam_filter_time']) ? $GLOBALS['spam_filter_time'] : 0;
        // A missing time file counts as infinitely old.
        $last_reset = file_exists($time_file) ? filectime($time_file) : 0;

        if ( $max_age < (time() - $last_reset) ) {
            foreach ( array('/hashes.txt', '/hashes_content.txt') as $name ) {
                if ( file_exists(SF_CACHE_PATH.$name) )
                    unlink(SF_CACHE_PATH.$name);
                touch(SF_CACHE_PATH.$name);
            }
            touch($time_file);
        }

        $hashes = fopen(SF_CACHE_PATH.'/hashes.txt','a');
        if ( ! $hashes )
            return;
        fputs( $hashes, md5($data)."\n" );
        fclose ($hashes);
    }

    /**
     * Counts how often the md5 of $data appears in hashes_content.txt.
     * Not tested.
     *
     * @param string $data text to look up (taken by reference, never modified)
     * @return int the number of matches; 0 when the file cannot be read
     */
    function detect_repeated_content( &$data )
    {
        $hashes = file(SF_CACHE_PATH.'/hashes_content.txt');
        if ( $hashes === false )
            return 0;

        $dataHash = md5($data);
        $matched_hashes = 0;
        foreach( $hashes as $hash )
        {
            // file() keeps the line ending, so strip it before comparing;
            // === avoids numeric-string juggling between hex digests.
            if ( rtrim($hash) === $dataHash )
                $matched_hashes++;
        }
        return $matched_hashes;
    }

    /**
     * Appends a timestamped line to ipblock.log.
     *
     * @param string $text message to record
     * @return void
     */
    function Log ($text) {
        $log = fopen(SF_CACHE_PATH."/ipblock.log","a");
        if ( ! $log )
            return;
        fwrite($log, date("m-d-y g:ia").": ".$text."\n");
        fclose($log);
    }

    /**
     * Reports whether link-like text makes up more than half of $str.
     *
     * http(s) links, the words "href" and "url", and www-prefixed host/path
     * runs are stripped; if less than half of the original length remains
     * the text is considered link spam.
     *
     * @param string $str text to inspect
     * @return int 1 if more than half of the text was link material,
     *             0 otherwise (including for an empty string)
     */
    function too_many_urls( $str ) {
        $total = strlen($str);
        // Nothing to measure, and it would divide by zero below.
        if ( $total === 0 )
            return 0;

        $sansUrls = preg_replace( '/https?:[a-zA-Z0-9.\\/]+/', '', $str );
        $sansHref = preg_replace( '/href/', '', $sansUrls );
        $sansUrl = preg_replace( '/url/', '', $sansHref );
        $sansWww = preg_replace( '/www[a-zA-Z0-9.\\/]+/', '', $sansUrl );

        // Lengths must be measured with strlen(); sizeof() counts array
        // elements and is meaningless for strings.
        if ( (strlen($sansWww) / $total) < 0.5 )
            return 1;
        else
            return 0;
    }
}
Detect strings
by johnk
Saturday, Oct. 21, 2006 at 1:24 AM
I forgot to add that there's a content spam detection method in there. It checks against a file of spam keywords.
I think the code to check for repeated content and IPs isn't quite right. I never used spamb to replace spam_class.inc. It was used for checking calendar spam.
New version of too_many_urls
by johnk
Saturday, Oct. 21, 2006 at 1:45 AM
I messed it up. Here's a good one:
/**
 * Reports whether link-like text makes up more than half of $str.
 *
 * http(s) links, the words "href" and "url", and www-prefixed host/path
 * runs are stripped; if less than half of the original length remains the
 * text is considered link spam.
 *
 * @param string $str text to inspect
 * @return int 1 if more than half of the text was link material,
 *             0 otherwise (including for an empty string)
 */
function too_many_urls( $str ) {
    $total = strlen($str);
    // An empty string has no links, and it would divide by zero below.
    if ( $total === 0 )
        return 0;

    $sansUrls = preg_replace( '/https?:[a-zA-Z0-9.\\/]+/', '', $str );
    $sansHref = preg_replace( '/href/', '', $sansUrls );
    $sansUrl = preg_replace( '/url/', '', $sansHref );
    $sansWww = preg_replace( '/www[a-zA-Z0-9.\\/]+/', '', $sansUrl );

    if ( (strlen($sansWww) / $total) < 0.5 )
        return 1;
    else
        return 0;
}
|