User:Cedars/gaauto.pl

The following Perl script is a hack that automatically creates a categorized list of good articles in the same format as the good articles page. The script understands quoted, italicized and disguised article links. It uses the existing list as a basis for the new list. It removes old good articles from the revised list and offers the user the opportunity to categorize new good articles. It sorts and counts every article and can automatically adapt to use new headings and subheadings. It allows dual listings and major headings. The script uses cURL to download existing content and the Roman 1.1 Perl module to sort Final Fantasy titles. The script is designed to assist the human editing of Wikipedia articles, not replace it. It is best that users still add and remove articles from the list as they would without the script - this is because they are likely to categorize the items better than the script user. Please feel free to make changes to this page if you feel they would improve the script. If you have comments on the script please feel free to post them on the talk page.

A brief note on output

The script downloads several files to the working directory and outputs two files. The first file, "output_headings.txt", is a file listing the levels and sublevels available for categorization. This file is output before any requests for categorization are made. The second file, "output.txt", is the formatted wiki-syntax for the list. It may appear corrupt if not opened as UTF-8. The script also outputs a timestamp list of when the most recently added articles were added, "stamp.time", and a backup of the previous version of that list, "stamp.bac". If the timestamp list and backup are dramatically different from each other the script will refuse to run. This is to prevent the timestamp list from becoming distorted and thus damaging the recently added list.

To view script properly use edit mode

#! /usr/bin/perl

# gaauto.pl - rebuilds the categorized list of Wikipedia good articles.
#
# Steps performed:
#   1. Back up the timestamp list (stamp.time -> stamp.bac).
#   2. Download the edit view of Wikipedia:Good_articles and parse its
#      preamble, major headings, headings, subheadings and article links.
#   3. Download every page of Category:Wikipedia_good_articles and collect
#      the article names from the "Talk:" links.
#   4. Mark listed articles that are still in the category as verified and
#      ask the user where to file articles that are only in the category.
#   5. Merge stored timestamps, pick the most recently added articles and
#      rewrite stamp.time.
#   6. Write the regenerated wiki text to output.txt (UTF-8).
#
# Requires the curl command line tool and the Roman module.

use strict;
use warnings;
use File::Copy qw(copy);
use Roman;
use open ':utf8';

# Download a fresh copy of files
my $DOWNLOAD = 1;
# Warn of removed articles
my $REMOVED = 1;
# Number of new articles to remember
my $NEWARTICLES = 14;
# Should open web browser or text editor
my $ADVANCED = 0;
# Adds section comments (improves editing)
my $SECTIONCOMMENTS = 0;
# Web browser and text editor commands
my $WEBBROWSER = "open";
my $TEXTEDITOR = "open";

# Locations on the wiki.
# NOTE(review): the edit URL was mangled in the copy this script was restored
# from; it is rebuilt here on the same host the category URL uses - confirm.
my $WIKI_BASE    = "http://en.wikipedia.org";
my $EDIT_URL     = $WIKI_BASE."/w/index.php?title=Wikipedia:Good_articles&action=edit";
my $CATEGORY_URL = $WIKI_BASE."/wiki/Category:Wikipedia_good_articles";

# Returns an entry's raw link text with any "~~..." suffix and the ''
# italic markers removed.  A missing name is treated as the empty string.
sub clean_name {
    my ($entry) = @_;
    my $name = defined $entry->{"name"} ? $entry->{"name"} : "";
    $name =~ s/~~.*//g;
    $name =~ s/''//g;
    return $name;
}

# Returns the displayed part of a link (the text after the last pipe).
sub link_label {
    my ($entry) = @_;
    my $name = clean_name($entry);
    if ($name =~ /.*\|(.*)/) {
        $name = $1;
    }
    return $name;
}

# Returns the target part of a link (the text before the last pipe).
sub link_target {
    my ($entry) = @_;
    my $name = clean_name($entry);
    if ($name =~ /(.*)\|.*/) {
        $name = $1;
    }
    return $name;
}

# Sorts article titles
sub titlesort { return titlecmp($a, $b); }

# Compares two article entries (hash references) by displayed title, case
# insensitively.  Two "Final Fantasy <roman numeral>" titles are ordered by
# the numeral's value; equal or unparsable numerals fall back to the text
# comparison.  Returns -1, 0 or 1.
sub titlecmp {
    my ($left, $right) = @_;
    my $x = defined $left->{"name"}  ? $left->{"name"}  : "";
    my $y = defined $right->{"name"} ? $right->{"name"} : "";

    # Handle Final Fantasy titles.  The word boundary keeps an ordinary word
    # that merely starts with X, V or I from being read as a numeral.
    my ($x_num, $y_num);
    if ($x =~ /Final Fantasy ([XVI]+)\b/) {
        $x_num = arabic($1);
    }
    if ($y =~ /Final Fantasy ([XVI]+)\b/) {
        $y_num = arabic($1);
    }
    if (defined $x_num && defined $y_num && $x_num != $y_num) {
        return $x_num <=> $y_num;
    }

    # Handle other titles
    return uc(link_label($left)) cmp uc(link_label($right));
}

# Sorts article names
sub basicsort { return basiccmp($a, $b); }

# Compares two article entries (hash references) by link target, case
# insensitively.  Returns -1, 0 or 1.
sub basiccmp {
    my ($left, $right) = @_;
    return uc(link_target($left)) cmp uc(link_target($right));
}

# Sorts article time stamps
sub timesort { return timecmp($a, $b); }

# Compares two article entries by their "time" value, newest first.
# Negative times mean "no longer a recent addition": they sort after every
# non-negative time and compare equal to each other.
sub timecmp {
    my ($left, $right) = @_;
    my $x = $left->{"time"};
    my $y = $right->{"time"};
    if ($x < 0 && $y < 0) {
        return 0;
    }
    elsif ($x < 0) {
        return 1;
    }
    elsif ($y < 0) {
        return -1;
    }
    return $y <=> $x;
}

# Decodes the four entities handled by this script.  "&amp;" goes last so
# that text is decoded exactly once ("&amp;quot;" becomes "&quot;", not '"').
sub unescape_html {
    my ($text) = @_;
    $text =~ s/&lt;/</g;
    $text =~ s/&gt;/>/g;
    $text =~ s/&quot;/\"/g;
    $text =~ s/&amp;/&/g;
    return $text;
}

# Searches @$list circularly, starting at index $start, for an entry with the
# same link target as $wanted.  Every index is visited at most once.
# Returns the index of the match or -1.
sub find_article {
    my ($list, $wanted, $start) = @_;
    my $count = scalar @{$list};
    return -1 if $count == 0;
    for my $offset (0 .. $count - 1) {
        my $index = ($start + $offset) % $count;
        if (basiccmp($list->[$index], $wanted) == 0) {
            return $index;
        }
    }
    return -1;
}

# Downloads $url into $file with curl.  The list form of system() keeps the
# URL (which may come from a downloaded page) away from the shell.  Dies if
# curl fails, so that a stale file is never mistaken for fresh data.
sub download {
    my ($url, $file) = @_;
    system("curl", "--fail", "--location", "--output", $file, $url) == 0
        or die "Download of $url failed.\n";
    return;
}

# Runs a configured command (which may carry its own arguments) on $target
# without going through the shell.
sub launch {
    my ($command, $target) = @_;
    my @argv = split(' ', $command);
    return if !@argv;
    system(@argv, $target) == 0
        or warn "Could not run \"$command\".\n";
    return;
}

# Prints the list of levels and sublevels written to output_headings.txt.
sub show_headings {
    open(my $fh, "<", "output_headings.txt")
        or die "Cannot read output_headings.txt: $!\n";
    print while <$fh>;
    close($fh);
    return;
}

# Reads one answer from the user with surrounding whitespace removed.
# Returns undef at end of input.
sub read_answer {
    my $answer = <STDIN>;
    return undef if !defined $answer;
    chomp($answer);
    $answer =~ s/^\s+//;
    $answer =~ s/\s+$//;
    return $answer;
}

# Returns the wiki markup for an entry's link: quoted, italic or plain.
sub format_link {
    my ($entry) = @_;
    my $link = "[[".$entry->{"name"}."]]";
    if ($entry->{"quote"}) {
        return "&quot;".$link."&quot;";
    }
    elsif ($entry->{"italic"}) {
        return "''".$link."''";
    }
    return $link;
}

# Keep backup of timestamp file
if (-f "stamp.bac") {
    my $stamp_size     = -s "stamp.time";
    my $stamp_bac_size = -s "stamp.bac";
    $stamp_size     = 0 if !defined $stamp_size;
    $stamp_bac_size = 0 if !defined $stamp_bac_size;
    if (abs($stamp_size - $stamp_bac_size) > 1024) {
        print "Large change in timestamp file. This script will now quit to prevent data loss.\n";
        print "Please delete the \"stamp.bac\" file to continue.\n";
        exit(1);
    }
}
if (-f "stamp.time") {
    copy("stamp.time", "stamp.bac")
        or die "Cannot back up stamp.time: $!\n";
}

# Download the current good articles file
if ($DOWNLOAD) {
    download($EDIT_URL, "input_ga.html");
}

# State gathered from the good articles file
my @preamble;               # lines up to and including the "Gapages" line
my @lang;                   # interlanguage link lines
my $new_articles_image;     # image shown in the recently listed box
my @major_text;             # major heading titles
my @major_icon;             # major heading icons
my @headings;               # $headings[level][0] heading, [level][n] subheading
my @headings_icon;          # icon per heading
my @headings_major;         # major heading index per heading
my @subheadings_len;        # number of subheadings per heading
my @articles;               # article entries (hash references)

my $major        = -1;
my $level        = -1;
my $sublevel     = 0;
my $headings_len = 0;
my $preamble_on  = 0;
my $main_on      = 0;
my $start        = 0;       # true while inside a subheading's article list

# Go through each line of the good articles file
open(my $ga_fh, "<", "input_ga.html")
    or die "Cannot read input_ga.html: $!\n";
while (my $curline = <$ga_fh>) {
    $curline = unescape_html($curline);

    # Handle preamble
    if ($preamble_on) {
        if ($curline =~ /Gapages/) {
            $preamble_on = 0;
            $main_on = 1;
        }
        if (!@preamble) {
            $curline =~ s/.*>//;
        }
        push @preamble, $curline;
    }
    elsif ($main_on) {
        # If it is a language remember it
        if ($curline =~ /\[\[[^W][^P]\:[^\]]*\]\]/) {
            push @lang, $curline;
        }

        # If it is a recently added article image remember it
        if ($curline =~ /colspan=2.*\[\[Image:(.*)\]\]/) {
            $new_articles_image = $1;
        }

        # If it is a major heading add it to the major headings
        if ($curline =~ /<div style="padding:[^>]*>([^<]*)<\/div>/) {
            my $realpart = $1;
            my $imagpart = $1;
            $realpart =~ s/\[\[.*\]\]//;
            $realpart =~ s/'''//g;
            $imagpart =~ s/[^\]]*$//;
            $major += 1;
            $major_text[$major] = $realpart;
            $major_icon[$major] = $imagpart;
        }

        # If it is a heading add it to the headings
        if ($curline =~ /<div class="NavHead"[^>]*>([^<]*)<\/div>/) {
            my $realpart = $1;
            my $imagpart = $1;
            $realpart =~ s/\[\[.*\]\]//;
            $imagpart =~ s/[^\]]*$//;
            $level += 1;
            $sublevel = 0;
            $headings_len += 1;
            $subheadings_len[$level] = 0;
            $headings[$level][$sublevel] = $realpart;
            $headings_icon[$level] = $imagpart;
            $headings_major[$level] = $major;
        }

        # If it is a subheading add it to the headings and start counting
        # articles (a subheading before the first heading has nowhere to go)
        if ($level >= 0 && $curline =~ /=====(.*)=====$/) {
            $sublevel += 1;
            $subheadings_len[$level] += 1;
            $headings[$level][$sublevel] = $1;
            $start = 1;
        }

        # If it is a div stop counting articles
        if ($curline =~ /\/div/) {
            $start = 0;
        }

        # If it is an article add it to the articles list
        if ($start && $curline =~ /.*\[\[([^\]]*)\]\]/) {
            my %entry = (
                "name"     => $1,
                "level"    => $level,
                "sublevel" => $sublevel,
                "verified" => 0,
                "multi"    => 0,
                "time"     => time(),
                "italic"   => ($curline =~ /^\ *\'\'/) ? 1 : 0,
                "quote"    => ($curline =~ /^\ *&quot/ || $curline =~ /^\ *\"/) ? 1 : 0,
            );
            if ($curline =~ /.*\[\[[^\]]*\]\].*<!--\ *(.*)\ *-->/) {
                my $commentstr = $1;
                $commentstr =~ s/\ +$//;
                $entry{"comment"} = $commentstr;
            }
            push @articles, \%entry;
        }
    }
    elsif ($curline =~ /textarea/) {
        $preamble_on = 1;
    }
}
close($ga_fh);

# Check download worked
if (!@articles) {
    print "Download of good article list failed.\n";
    exit(1);
}

# Sort the articles list
@articles = sort basicsort @articles;

# Check for multiple entries: a repeated article keeps its first listing and
# remembers the other one as its secondary level and sublevel
my @unique = ($articles[0]);
for my $i (1 .. $#articles) {
    if (basiccmp($articles[$i], $articles[$i - 1]) == 0) {
        $unique[-1]{"multi"} = 1;
        $unique[-1]{"sec_level"} = $articles[$i]{"level"};
        $unique[-1]{"sec_sublevel"} = $articles[$i]{"sublevel"};
    }
    else {
        push @unique, $articles[$i];
    }
}
@articles = @unique;

# Go through each of the category files
my @cat_articles;
my $next = $CATEGORY_URL;
for (my $page = 1; defined $next; $page++) {
    my $cat_file = "input_cat$page.html";

    # Download the category file
    if ($DOWNLOAD) {
        download($next, $cat_file);
    }

    # Read the category file
    open(my $cat_fh, "<", $cat_file)
        or die "Cannot read $cat_file: $!\n";
    undef $next;
    while (my $curline = <$cat_fh>) {
        $curline =~ s/&amp;/&/g;

        # If it is an article add it to the category articles list
        while ($curline =~ />Talk:([^<]*)</g) {
            push @cat_articles, { "name" => $1 };
        }

        # Find the next category file
        if ($curline =~ /<a.*href=\"([^\"]*)\"[^>]*>next 200/) {
            $next = $WIKI_BASE.$1;
        }
    }
    close($cat_fh);
}

# Check download worked
if (!@cat_articles) {
    print "Download of good article category failed.\n";
    exit(1);
}

# Print the headings to file
open(my $headings_fh, ">", "output_headings.txt")
    or die "Cannot write output_headings.txt: $!\n";
for my $i (0 .. $headings_len - 1) {
    for my $j (0 .. $subheadings_len[$i]) {
        if ($j == 0) {
            print {$headings_fh} $i.".0    ".$headings[$i][$j]."\n";
        }
        else {
            print {$headings_fh} "  ".$i.".".$j."  ".$headings[$i][$j]."\n";
        }
    }
}
close($headings_fh) or die "Cannot write output_headings.txt: $!\n";

# Sort category articles list
@cat_articles = sort basicsort @cat_articles;

# Go through each of the category articles.  Both lists are sorted, so the
# search for each one starts just after the previous match.
my $orig = 0;
for my $cat (@cat_articles) {
    my $cat_name = $cat->{"name"};

    # Search the articles list for the current category article
    my $found_index = find_article(\@articles, $cat, $orig);

    # If an article is found mark it verified otherwise add a new article to
    # the list
    if ($found_index != -1) {
        $orig = ($found_index + 1) % scalar(@articles);
        my $entry = $articles[$found_index];
        $entry->{"verified"} = 1;

        # Take the category's spelling of the target but keep a lower case
        # first letter if the list used one
        my $name_lower = substr($entry->{"name"}, 0, 1) ne substr($cat_name, 0, 1);
        $entry->{"name"} =~ s/[^|]*/$cat_name/;
        if ($name_lower) {
            $entry->{"name"} = lcfirst($entry->{"name"});
        }
        next;
    }

    print "Article not found: ".$cat_name."\n";
    my $new_level;
    my $ignored = 0;
    while (!defined $new_level && !$ignored) {
        print "Which level do you what to assign it to? (t for list, n for ignore)\n";
        my $in = read_answer();
        if (!defined $in) {
            print "No more input.\n";
            exit(1);
        }
        $in = lc($in);
        if ($in eq "w") {
            if ($ADVANCED) {
                show_headings();
                my $artname = $cat_name;
                $artname =~ s/\"//g;
                $artname =~ s/ /_/g;
                launch($WEBBROWSER, $WIKI_BASE."/wiki/".$artname);
            }
        }
        elsif ($in eq "t") {
            show_headings();
        }
        elsif ($in eq "exit" || $in eq "q") {
            exit(1);
        }
        elsif ($in eq "n") {
            $ignored = 1;
        }
        elsif ($in =~ /^\d+$/ && $in < $headings_len) {
            $new_level = $in + 0;
        }
        else {
            print "Unknown level: ".$in."\n";
        }
    }
    next if $ignored;

    my $new_sublevel;
    while (!defined $new_sublevel) {
        print "Which sublevel do you what to assign it to?\n";
        my $in = read_answer();
        if (!defined $in) {
            print "No more input.\n";
            exit(1);
        }
        if ($in =~ /^\d+$/ && $in <= $subheadings_len[$new_level]) {
            $new_sublevel = $in + 0;
        }
        else {
            print "Unknown sublevel: ".$in."\n";
        }
    }

    # The entry is only added once it is complete, so an ignored article
    # never leaves a half-filled element behind in the list
    push @articles, {
        "name"     => $cat_name,
        "level"    => $new_level,
        "sublevel" => $new_sublevel,
        "verified" => 1,
        "multi"    => 0,
        "time"     => time(),
        "italic"   => 0,
        "quote"    => 0,
    };
}

# Open the time stamps (there are none on the first run)
if (-f "stamp.time") {
    open(my $stamp_fh, "<", "stamp.time")
        or die "Cannot read stamp.time: $!\n";
    my $stamp_orig = 0;
    while (my $curline = <$stamp_fh>) {
        # Fill out the stamp; lines that are not "[[name]] time" are skipped
        next if $curline !~ /\[\[(.*)\]\]\s*(-?\d+)/;
        my %stamp = ("name" => $1, "time" => $2 + 0);

        # Search the articles list for a match and assign the time stamp
        my $found_index = find_article(\@articles, \%stamp, $stamp_orig);
        if ($found_index != -1) {
            $articles[$found_index]{"time"} = $stamp{"time"};
            $stamp_orig = ($found_index + 1) % scalar(@articles);
        }
    }
    close($stamp_fh);
}

# Find the new articles: the newest verified ones keep their time stamp,
# every other verified article is stamped -1
my @new_articles;
@articles = sort timesort @articles;
open(my $stamp_out, ">", "stamp.time")
    or die "Cannot write stamp.time: $!\n";
for my $entry (@articles) {
    next if !$entry->{"verified"};
    if (@new_articles < $NEWARTICLES && $entry->{"time"} != -1) {
        push @new_articles, $entry;
    }
    else {
        $entry->{"time"} = -1;
    }
    print {$stamp_out} "[[".$entry->{"name"}."]] ".$entry->{"time"}."\n";
}
close($stamp_out) or die "Cannot write stamp.time: $!\n";
@new_articles = sort titlesort @new_articles;

# Sort the articles again
@articles = sort basicsort @articles;

# Open the output file
open(my $out, ">", "output.txt")
    or die "Cannot write output.txt: $!\n";

# Print out preamble
print {$out} @preamble;

# Print the recently added articles.  The em dash is written as an escape so
# that the :utf8 output layer encodes it correctly.
print {$out} "|-\n| colspan=2 width=\"100%\" style=\"padding:1em 1em 1em 1em; border:1px solid #dfdfdf; background-color:#E0EDFA\"  valign=\"top\" align=\"center\"|";
if ($new_articles_image) {
    print {$out} "[[Image:".$new_articles_image."]]";
}
print {$out} "\n'''Recently listed good articles'''\n\n";
print {$out} join(" \x{2014}\n", map { format_link($_) } @new_articles);
print {$out} "\n|}\n\n__NOTOC__\n";
print {$out} "<div style=\"clear:both;\">\n";
print {$out} "<!-- DO NOT REMOVE THIS DIV, USED TO FORCE IE TO DISPLAY BACKGROUND FOR ARTS DIV -->\n";
print {$out} "</div>\n";

# Go through each heading and subheading
my $total_count = 0;    # articles counted once, under their primary listing
my $major_seen  = -1;
for my $i (0 .. $headings_len - 1) {
    # Print out major heading
    # NOTE(review): the closing and opening div tags are emitted in the same
    # order as the original script; whether they nest as intended was not
    # verified against the rendered page.
    if ($headings_major[$i] > $major_seen) {
        $major_seen = $headings_major[$i];
        if ($major_seen > 0) {
            print {$out} "</div>\n</div>\n";
        }
        print {$out} "<div style=\"clear:both;\">\n";
        print {$out} "<span id=\"".$major_text[$major_seen]."\" />\n";
        print {$out} "<div style=\"padding:5px 5px 8px 5px; background-color:#CCCCFF; text-align:left; font-size:larger;\">".$major_icon[$major_seen]."'''".$major_text[$major_seen]."'''</div>\n";
        print {$out} "<div style=\"text-align:left;\">\n";
    }

    for my $j (0 .. $subheadings_len[$i]) {
        # Write the heading or subheading
        if ($j == 0) {
            if ($i != 0) {
                print {$out} "</div>\n";
                print {$out} "</div>\n";
                print {$out} "\n";
            }
            print {$out} "<div style=\"clear:both;\" class=\"NavFrame\">\n";
            print {$out} "<div class=\"NavHead\" style=\"padding:2px 2px 2px 30px; background-color:#FFFAF0; text-align:left; font-size:larger;\">".$headings_icon[$i].$headings[$i][$j]."</div>\n";
            print {$out} "<div class=\"NavContent\" style=\"text-align:left;\">\n";
            # \x{AD} is the soft hyphen that fills the otherwise empty
            # section title
            if ($SECTIONCOMMENTS) {
                print {$out} "==<!--".$headings[$i][$j]."-->\x{AD} ==\n";
            }
            else {
                print {$out} "==\x{AD} ==\n";
            }
        }
        else {
            print {$out} "\n=====".$headings[$i][$j]."=====\n";
        }

        # Run through the articles adding them if they belong to the current
        # level
        my @cur_articles;
        for my $entry (@articles) {
            if ($entry->{"level"} == $i && $entry->{"sublevel"} == $j) {
                if ($entry->{"verified"}) {
                    push @cur_articles, $entry;
                    $total_count++;
                }
                elsif ($REMOVED) {
                    print "REMOVED ARTICLE: ".$entry->{"name"}."\n";
                }
            }
            elsif ($entry->{"multi"} == 1
                && $entry->{"sec_level"} == $i
                && $entry->{"sec_sublevel"} == $j) {
                # Secondary listing: shown here but not counted again
                if ($entry->{"verified"}) {
                    push @cur_articles, $entry;
                }
            }
        }

        # Then sort and print the articles
        next if !@cur_articles;
        @cur_articles = sort titlesort @cur_articles;
        my @items;
        for my $entry (@cur_articles) {
            my $item = format_link($entry);
            if ($entry->{"comment"}) {
                $item .= " <!-- ".$entry->{"comment"}." -->";
            }
            push @items, $item;
        }
        print {$out} join(" \x{2014}\n", @items);
        my $article_count = scalar @cur_articles;
        if ($article_count == 1) {
            print {$out} "\n<small>\x{2014} (1 article)</small>\n";
        }
        else {
            print {$out} "\n<small>\x{2014} (".$article_count." articles)</small>\n";
        }
    }
}

# Close the output file
print {$out} "</div>\n";
print {$out} "</div>\n\n";
print {$out} @lang;
print {$out} "\n";
print {$out} "[[Category:Wikipedia good articles|  ]]\n";
close($out) or die "Cannot write output.txt: $!\n";

# Reopen the output file and reprint with correct number of articles
open(my $reread, "<", "output.txt")
    or die "Cannot read output.txt: $!\n";
my @output_lines = <$reread>;
close($reread);
open(my $rewrite, ">", "output.txt")
    or die "Cannot write output.txt: $!\n";
for my $line (@output_lines) {
    $line =~ s/\[\[Wikipedia\:Good articles\/Statistics\|[0-9]*\]\]/\[\[Wikipedia\:Good articles\/Statistics\|$total_count\]\]/;
    $line =~ s/expr: \{\{NUMBEROFARTICLES\:R\}\} \/ [0-9]*/expr: \{\{NUMBEROFARTICLES\:R\}\} \/ $total_count/;
    print {$rewrite} $line;
}
close($rewrite) or die "Cannot write output.txt: $!\n";

# Print out total number of articles
print "Number of articles: ".$total_count."\n";

# Open for editing
if ($ADVANCED) {
    print "Do you want me to open your browser for editing? (y/n)\n";
    my $in = read_answer();
    if (defined $in && lc($in) eq "y") {
        launch($WEBBROWSER, $EDIT_URL);
        launch($TEXTEDITOR, "output.txt");
    }
}