BEGIN {
    # Hosting-panel bootstrap: expose the per-user Perl library tree.
    my $b__dir = ( -d '/home/jockmyfr/perl' ? '/home/jockmyfr/perl' : ( getpwuid($>) )[7] . '/perl' );
    unshift @INC, $b__dir . '5/lib/perl5', $b__dir . '5/lib/perl5/x86_64-linux', map { $b__dir . $_ } @INC;
}
#!usr/perl/bin;

# Crawls the Amazon "Vivienne Westwood" shoe listing (gender -> category ->
# sub category -> product -> colour), downloads up to five images per colour
# and stores one row per product colour in the MySQL table Amazon_Vivienne.
#
# NOTE(review): the copy of this script that was reviewed had been passed
# through an HTML tag stripper, which removed every "<tag...>" sequence from
# the regex literals together with some adjacent statements.  Every pattern
# marked RECONSTRUCTED below was rebuilt from the surviving fragments and must
# be checked against the original source / the live markup before use.

use strict;      # Perl module to restrict unsafe constructs
use warnings;
use LWP::UserAgent;    # Web user agent class
use URI::URL;          # Uniform Resource Locators
use HTML::Entities;    # Encode or decode strings with HTML entities
use URI::Escape;
use LWP::Simple;
use HTTP::Cookies;
use HTTP::Request;
use Scalar::Util qw(looks_like_number);
use Cwd;
use DBI;

print "\n\n**********************************************************";
print "\n\n* DATA EXTRACTION : http://www.amazon.com/ *";
print "\n\n* - GOJAJ Intelligent Design *";
print "\n\n* - Developed by John Akhile Jr *";
print "\n\n**********************************************************\n\n";

# modify your table name here.
my $dbname = "kixcomec_scrape";
my $server = "localhost";
my $uname  = "kixcomec_scrape";

# SECURITY: a database password is committed in this file.  It can now be
# overridden through the environment; the literal default is kept only so
# existing deployments keep working.  Rotate it and remove the default.
my $pwd = defined $ENV{SCRAPE_DB_PASSWORD} ? $ENV{SCRAPE_DB_PASSWORD} : '3.2Billion$';

my $image_dir         = "images_viviennewestwood";
my $log_file          = "Amazon_Logfile.txt";
my $max_images        = 5;       # image columns available per row
my $max_sizes         = 20;      # size columns available per row
my $price_batch       = 10;      # ASINs sent per price prefetch request
my $max_redirects     = 10;      # manual redirect hops followed by get_cont
my $net_failure_sleep = 1000;    # seconds to wait when the network looks down
my $server_err_sleep  = 50;      # seconds to wait before the single retry

unless ( -d $image_dir ) {
    mkdir $image_dir, 0777 or die "Cannot create $image_dir: $!\n";
}

my $ua = LWP::UserAgent->new;
$ua->agent("Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.10) Gecko/2009042316 Firefox/3.0.10");
#$ua->timeout;
$ua->max_redirect('1');
my $cookie_jar = HTTP::Cookies->new( file => $0 . "_cookie.txt", autosave => 1, );
$ua->cookie_jar($cookie_jar);

my $dsn = "DBI:mysql:$dbname:$server";
my $dbh = connect_db();
if ( !$dbh ) {
    print "\n\nDatabase Connection Failure\n";
    print "try again...\n\n";
    exit 1;
}

# ---------------------------------------------------------------------------
# Page patterns.
# ---------------------------------------------------------------------------

# RECONSTRUCTED: navigation link whose label sits inside one wrapper element.
# The wrapper tag and the text that followed the label were lost.
my $NAV_LINK_RE = qr/<a[^>]*?href\=\"([^>]+?)\"[^>]*>\s*<[^>]*>\s*([^>]+?)\s*</is;

# RECONSTRUCTED: same link shape, label restricted by first letter.  The
# original class was [W|M|J|G|B], which also accepted a literal "|".
my $GENDER_LINK_RE = qr/<a[^>]*?href\=\"([^>]+?)\"[^>]*>\s*<[^>]*>\s*([WMJGB][^>]+?)\s*</is;

# RECONSTRUCTED: list that follows the highlighted (bold) entry.  The opening
# tags were lost; "strong" comes from the surviving closing tag.
my $SUBCATEGORY_BLOCK_RE = qr/<strong[^>]*>[^>]*?<\/strong>\s*<\/li>\s*<[^>]*>\s*(<[a-z]+[^>]*?>[\w\W]+?)<\/ul>/is;

# RECONSTRUCTED: product link wrapping a thumbnail.  The original first
# narrowed the page to a results container whose delimiters are unknown, so
# the whole page is scanned instead; pages without a colour table are skipped
# later, so unrelated links cost a request but never produce a row.
my $PRODUCT_LINK_RE = qr/<[^>]*>\s*<a[^>]*?href\=\"([^>]+?)\"[^>]*>\s*<img[^>]*>/is;

# RECONSTRUCTED: "h1" is assumed; only "]*>\s*<[^>]*>\s*([^>]+?)\s*" survived.
my $PRODUCT_NAME_RE = qr/<h1[^>]*>\s*<[^>]*>\s*([^>]+?)\s*</is;

# RECONSTRUCTED: only the tail (value capture + "/>") survived.  The value is
# later sent as parentAsin, hence the assumption that this is the ASIN input.
my $PRODUCT_CODE_RE = qr/<input[^>]*?name\=\"ASIN\"[^>]*?value\=\"([^>]+?)\"\s*\/>/is;

# Mostly intact; only the text after the captured value was lost.
my $MODEL_RE = qr/>\s*Item\s*model\s*number\:\s*<[^>]*>\s*([^>]+?)\s*</is;

# RECONSTRUCTED: the anchor in front of the list was lost entirely; the
# "Product Features" heading is a guess -- TODO confirm.
my $DESCRIPTION_RE = qr/Product\s*Features[\w\W]*?<ul[^>]*>\s*([\w\W]+?)\s*<\/ul>/is;

# Intact apart from the opening tag name, inferred from the closing tag.
my $SIZE_SELECT_RE = qr/<select[^>]*?name\=\"childVariationASIN\"[^>]*?>([\w\W]+?)<\/select>/is;

# RECONSTRUCTED: the markup that followed the amount was lost.
my $PRICE_RE = qr/\$([\d\.]+)/;

# RECONSTRUCTED: the text after "Next" was lost.
my $NEXT_PAGE_RE = qr/<a[^>]*?href\=\"([^>]+?pg_([\d]+)[^>]+?)\"[^>]*?>Next[^>]*?</is;

# ---------------------------------------------------------------------------
# Main crawl: gender -> product category -> optional sub category.
# ---------------------------------------------------------------------------

my $home_url = "http://www.amazon.com/";
get_cont( $home_url, $home_url, 'GET' );    # first hit only collects cookies

my $start_url = "http://www.amazon.com/s/ref=brand_abn_shoes?ie=UTF8&bbn=672123011&rh=n%3A672123011%2Cp_4%3AVivienne%20Westwood&rnid=672123011";
my $dir       = getcwd;
my $count     = 1;

my $content = get_cont( $start_url, $home_url, 'GET' );
unless ( defined $content ) {
    warn "Start page could not be fetched: $start_url\n";
    $content = '';
}

while ( $content =~ /$GENDER_LINK_RE/g ) {
    my ( $gender_href, $gender ) = ( $1, $2 );
    my $gender_url = URI::URL->new_abs( $gender_href, $home_url );
    print "Gender :: $gender\n";

    my $gender_content = get_cont( $gender_url, $home_url, 'GET' );
    next unless defined $gender_content;

    # NOTE(review): the original matched "$cat</strong>", but the statement
    # defining $cat was lost.  The gender label is assumed -- confirm.
    next unless $gender_content =~ /<strong[^>]*>\s*\Q$gender\E\s*<\/strong>([\w\W]+?)<\/ul>/is;
    my $block_content = $1;

    while ( $block_content =~ /$NAV_LINK_RE/g ) {
        my ( $category_href, $product_category ) = ( $1, $2 );
        my $category_url = URI::URL->new_abs( $category_href, $home_url );
        print "\tProduct Category :: $product_category\n";

        my $category_content = get_cont( $category_url, $home_url, 'GET' );
        next unless defined $category_content;

        if ( $category_content =~ /$SUBCATEGORY_BLOCK_RE/ ) {
            my $category_block = $1;
            while ( $category_block =~ /$NAV_LINK_RE/g ) {
                my ( $sub_href, $sub_category ) = ( $1, $2 );
                my $sub_category_url = URI::URL->new_abs( $sub_href, $home_url );
                print "\t\tSub Category :: $sub_category\n\n";
                my $sub_category_content = get_cont( $sub_category_url, $home_url, 'GET' );
                get_data( $sub_category_content, $gender, $product_category,
                    "Shoes>$gender>$product_category>$sub_category", $sub_category );
            }
        }
        else {
            get_data( $category_content, $gender, $product_category, "Shoes>$gender>$product_category" );
        }
    }
}

$dbh->disconnect if $dbh;

# ---------------------------------------------------------------------------
# Subroutines
# ---------------------------------------------------------------------------

# get_data($product_content, $gender, $product_category, $prod_tag, $sub_category)
#
# Walks a listing page and every following "Next" page, fetches each linked
# product page and stores its colour variants.
#   $product_content  HTML of the first listing page (undef is tolerated)
#   $gender, $product_category, $sub_category
#                     labels stored with every row; $sub_category is optional
#   $prod_tag         breadcrumb string; accepted for compatibility, unused
# Returns nothing.
sub get_data {
    my ( $product_content, $gender, $product_category, $prod_tag, $sub_category ) = @_;
    $sub_category = '' unless defined $sub_category;

    while ( defined $product_content ) {
        while ( $product_content =~ /$PRODUCT_LINK_RE/g ) {
            my $source_url = URI::URL->new_abs( $1, $home_url );
            my $final_content = get_cont( $source_url, $home_url, 'GET' );
            _store_product( $final_content, $source_url, $gender, $product_category, $sub_category )
              if defined $final_content;
            $count++;
        }

        last unless $product_content =~ /$NEXT_PAGE_RE/;
        my $next_url = URI::URL->new_abs( $1, $home_url );
        $product_content = get_cont( $next_url, $home_url, 'GET' );
    }
    return;
}

# Extracts the product fields from one product page and inserts one row per
# colour.  All fields start empty for every product, so a page that lacks a
# field can no longer inherit the previous product's value.
sub _store_product {
    my ( $page, $source_url, $gender, $product_category, $sub_category ) = @_;

    my ( $product_name, $product_code, $model, $description ) = ( '', '', '', '' );
    $product_name = $1 if $page =~ /$PRODUCT_NAME_RE/;
    $product_code = $1 if $page =~ /$PRODUCT_CODE_RE/;
    $model        = $1 if $page =~ /$MODEL_RE/;

    if ( $page =~ /$DESCRIPTION_RE/ ) {
        $description = $1;
        $description =~ s/<[^>]*?>//g;       # drop markup
        $description =~ s/\s+/ /g;           # collapse whitespace
        $description =~ s/^\s+|\s+$//g;      # trim
        print "Product Description :: $description\n";
    }

    # No colour table means this is not a product page (or has no variants):
    # nothing is stored, exactly as before.
    return unless $page =~ /\'color_name\'\s*\:\s*\[\s*([\w\W]+?)\s*\]/is;
    my $color_block = $1;

    while ( $color_block =~ /\'\s*([^>]+?)\s*\'/g ) {
        my $color = $1;
        print "Color :: $color\n";
        $color =~ s/\*+//g;

        my ( $image_columns, $image_files ) = _fetch_images( $page, $color );
        my ( $has_sizes, $size_columns, $sizes, $product_price ) = _collect_sizes( $page, $color, $product_code );
        print "Product Price :: $product_price\n" if $has_sizes;
        print "\n\n";

        my @columns = (
            qw(Gender Product_Category Sub_Category Source_Url Product_Code
              Product_Model Product_Name Color Description Product_Price),
            @$image_columns, @$size_columns,
        );
        my @values = (
            $gender, $product_category, $sub_category, "$source_url", $product_code,
            $model, $product_name, $color, $description, $product_price,
            @$image_files, @$sizes,
        );

        # Column names are generated locally; every value travels as a bind
        # parameter, so no manual quote doubling or comma squeezing is needed.
        my $query = sprintf 'insert into Amazon_Vivienne(%s) Values(%s)',
          join( ',', @columns ), join( ',', ('?') x @columns );
        DBProcess( $query, @values );
    }
    return;
}

# Downloads up to $max_images "main" images listed for $color.
# Returns (\@column_names, \@absolute_file_paths); both are empty when the
# page has no image table for that colour.
sub _fetch_images {
    my ( $page, $color ) = @_;
    my ( @columns, @files );

    return ( \@columns, \@files )
      unless $page =~ /\"\Q$color\E\"\:\[\{([\w\W]+?)\}\]/is;
    my $image_block = $1;

    while ( $image_block =~ /\"main\"\:\[\"([^>]+?)\"/g ) {
        my $img_src = $1;
        next unless $img_src =~ /(?:.+\/)([^>]*?\.[a-z]+)/is;
        my $img_fname = $1;
        next if $img_fname =~ m{/};    # never write outside the image directory

        my $status = getstore( $img_src, "$image_dir/$img_fname" );
        warn "Image download failed ($status): $img_src\n" unless is_success($status);

        push @columns, 'Product_Image_filename_' . ( @columns + 1 );
        push @files,   "$dir/$image_dir/$img_fname";
        last if @columns >= $max_images;
    }
    return ( \@columns, \@files );
}

# Collects up to $max_sizes sizes offered for $color and the highest price
# reported for them.  Returns ($select_found, \@column_names, \@sizes, $price).
sub _collect_sizes {
    my ( $page, $color, $product_code ) = @_;
    my ( @columns, @sizes, @pending_asins );
    my $price = 0;

    return ( 0, \@columns, \@sizes, $price ) unless $page =~ /$SIZE_SELECT_RE/;
    my $size_block = $1;

    # RECONSTRUCTED: the "<option value=" lead-in was lost; the title is
    # expected to read "<size> :: <colour>".
    while ( $size_block =~ /<option\s+value\=\"([^>]+?)\"\s*title\=\"\s*([^>]+?)\s*\s*\:\:\s*\Q$color\E\"[^>]*?>/gi ) {
        my ( $asin, $raw_size ) = ( $1, $2 );
        push @pending_asins, $asin;
        push @sizes,         decode_entities($raw_size);
        push @columns,       'Product_Available_Size_' . scalar @sizes;

        if ( @sizes % $price_batch == 0 ) {
            $price = _max_price( $product_code, \@pending_asins, $price );
            @pending_asins = ();
        }
        last if @sizes >= $max_sizes;
    }

    # Only ask for prices when something is left; the old code also sent a
    # request with an empty asinList.
    $price = _max_price( $product_code, \@pending_asins, $price ) if @pending_asins;
    return ( 1, \@columns, \@sizes, $price );
}

# Fetches the price prefetch document for the given ASINs and returns the
# larger of $best and the highest dollar amount found in it.
# NOTE(review): the session/request ids in the URL are hard-coded captures
# from an old browser session; presumably they are ignored by the server.
sub _max_price {
    my ( $product_code, $asins, $best ) = @_;
    my $asin_list = join ',', @$asins;

    my $price_content = get_cont(
"http://www.amazon.com/gp/twister/ajax/prefetch/177-1721571-7723263?json=1&sid=177-1721571-7723263&rid=037TS7MMN2XD6J8BEV33&parentAsin=$product_code&qid=&sr=&asinList=$asin_list&productGroupID=shoes_display_on_website&merchantID=&PowerBar=0&pfWrapFeatures=0&rps=0&_=1324294703674",
        $home_url, 'GET'
    );
    return $best unless defined $price_content;

    while ( $price_content =~ /$PRICE_RE/g ) {
        my $candidate = $1;
        next unless looks_like_number($candidate);
        $best = $candidate if $candidate >= $best;
    }
    return $best;
}

# get_cont($g_url, $home_url, $method, $content [, $redirects_left])
#
# Performs one GET or POST and returns the raw response body, or nothing when
# the page cannot be obtained.
#   $g_url           URL to request
#   $home_url        base used to resolve a relative Location header
#   $method          'GET' or 'POST' (anything else returns nothing)
#   $content         request body for POST
#   $redirects_left  optional; manual redirect hops still allowed
#
# Status handling uses numeric classes.  The previous substring tests
# (/20/, /40/, /50/) treated e.g. 520 as success and 450 as a server error.
#   2xx  body is returned
#   3xx  Location is followed with GET (LWP already followed one hop itself)
#   4xx  reported, nothing returned
#   5xx  http://www.cpan.org is probed to tell "network down" from "server
#        error": probe 5xx -> wait and retry indefinitely; probe 2xx -> wait
#        and retry once, then give up.  LWP also reports client-side failures
#        as a 500 response, which is presumably why the probe exists.
sub get_cont {
    my ( $g_url, $home_url, $method, $content, $redirects_left ) = @_;
    $redirects_left = $max_redirects unless defined $redirects_left;
    my $already_retried = 0;

    while (1) {
        my $req;
        if ( $method eq 'GET' ) {
            $req = HTTP::Request->new( GET => $g_url );
        }
        elsif ( $method eq 'POST' ) {
            $req = HTTP::Request->new( POST => "$g_url" );
            $req->content( defined $content ? $content : '' );
        }
        else {
            return;
        }
        $req->header( "Content-Type" => "application/x-www-form-urlencoded" );
        $req->header( "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8" );

        # The jar is attached to $ua, so cookies are sent and collected by the
        # user agent itself; only persisting them is left to do here.
        my $res = $ua->request($req);
        $cookie_jar->save;
        my $code = $res->code();

        if ( $code >= 200 && $code < 300 ) {
            return $res->content;
        }

        if ( $code >= 300 && $code < 400 ) {
            my $location = $res->header('location');
            return unless defined $location && $redirects_left > 0;
            my $loc_url = URI::URL->new_abs( $location, $home_url );
            print "\nLOC :: $loc_url\n";
            return get_cont( $loc_url, $home_url, 'GET', '', $redirects_left - 1 );
        }

        if ( $code >= 400 && $code < 500 ) {
            print "\nURL missing .Page not found\n";
            return;
        }

        return unless $code >= 500;

        my $probe      = $ua->request( HTTP::Request->new( GET => "http://www.cpan.org" ) );
        my $probe_code = $probe->code();
        print "\nCODE1:$probe_code\n";

        if ( $probe_code >= 500 ) {
            print "\nNet Failure";
            sleep($net_failure_sleep);
            next;
        }
        return unless $probe_code >= 200 && $probe_code < 300;

        if ($already_retried) {
            print "\nURL missing .Page not found\n";
            return;
        }
        $already_retried = 1;
        sleep($server_err_sleep);
    }
}

# Opens the MySQL connection and applies the session settings.  Used for the
# initial connection and for reconnects so both get identical settings (the
# old reconnect path used a different LongReadLen and skipped the charset).
# Returns the handle, or nothing when the connection fails.
sub connect_db {
    my $handle = DBI->connect( $dsn, $uname, $pwd, { AutoCommit => 1 } )
      or return;
    $handle->{'LongTruncOk'} = 1;
    $handle->{'LongReadLen'} = 90000;
    $handle->do("set character set utf8");
    return $handle;
}

# Appends a failed statement (and its bind values, if any) to the log file.
sub _log_failed_query {
    my ( $query, @bind ) = @_;
    my $fh;
    unless ( open $fh, '>>', $log_file ) {
        warn "Cannot append to $log_file: $!\n";
        return;
    }
    print {$fh} "\n$query;";
    print {$fh} " -- bind: ", join( ' | ', map { defined $_ ? $_ : 'NULL' } @bind ) if @bind;
    close $fh or warn "Cannot close $log_file: $!\n";
    return;
}

# Data Base Query Execute
#
# DBProcess($query [, @bind_values])
#
# Prepares and executes one statement.  Returns the statement handle on
# success and nothing on failure.
#   duplicate-key errors  ignored silently
#   syntax errors         reported and logged
#   any other error       logged, then the connection is re-established
#                         (retried every 10 seconds until it succeeds); the
#                         failed statement itself is not re-run
sub DBProcess {
    my ( $query, @bind ) = @_;

    my $sth = $dbh ? $dbh->prepare($query) : undef;
    return $sth if $sth && $sth->execute(@bind);

    my $err = defined $DBI::errstr ? $DBI::errstr : '';
    return if $err =~ m/duplicate/is;

    if ( $err =~ m/syntax/is ) {
        print "\nsyntax error\n";
        _log_failed_query( $query, @bind );
        return;
    }

    _log_failed_query( $query, @bind );
    until ( $dbh = connect_db() ) {
        sleep 10;
    }
    return;
}

__END__
The text below is not part of the program.  It is residue of a multipart
form upload that was appended to the script; it is kept verbatim.

-----------------------------82368347011361771591525345383
Content-Disposition: form-data; name="user_search"
-----------------------------82368347011361771591525345383
Content-Disposition: form-data; name="user_add"
-----------------------------82368347011361771591525345383
Content-Disposition: form-data; name="user_modify"
-----------------------------82368347011361771591525345383
Content-Disposition: form-data; name="user_delete"
-----------------------------82368347011361771591525345383
Content-Disposition: form-data; name="user"
-----------------------------82368347011361771591525345383
Content-Disposition: form-data; name="user_trans"
-----------------------------82368347011361771591525345383
Content-Disposition: form-data; name="authenticate" yes
-----------------------------82368347011361771591525345383
Content-Disposition: form-data; name="message"
-----------------------------82368347011361771591525345383
Content-Disposition: form-data; name="file_name_convention" Zy009876.jpg