|
|||||||||||||||||||||||||||
On Tue, 4 Dec 2001, Carsten Svaneborg wrote:
> Jeg vil lave et perl script der fodret med en liste
> af EPxxxxxx og WOyyyyyyy patenter udspytter en html
Måske kan min patent-fetcher være til inspiration.
/Ole
#!/usr/bin/perl
# NAME
# patentfetcher - fetch a patent
#
# SYNOPSIS
# patentfetcher patentnumber
#
# DESCRIPTION
# Fetches the given patent from espacenet.com (EPO) and
# returns all the pages into one single PDF-file,
# <patentnumber>.pdf
#
# BUGS
# The patentnumber must include zeros and suffix. A2 seems to
# make sense. E.g. EP0947922A2. To find the exact number you may want
# to locate the patent on espacenet.com and then use patentfetcher
# to download it.
#
# The resulting PDF-file is huge even though the fetched files
# are much smaller. It seems to be a bug in ghostscript.
#
# LICENSE
# GPL
#
# COPYRIGHT
# Copyright (C) 2001-07-13 Ole Tange <sslug@sslug>
# Main program: read the patent number from the command line and try the
# known kind-code suffixes until one of them yields pages.
# Example value: $patent_no="EP0947922";
$patent_no = shift;

# \A ... \z instead of ^ ... $: "$" would also accept a trailing newline,
# which would then end up in file names and in the fetch command line.
defined $patent_no and $patent_no =~ /\A[A-Z0-9]+\z/
    or die "Patent must be something like EP0947922";

#fetch_html($patent_no);
# Patent_id is either EP0947922A1 or EP0947922A2
# GET /dips/bnsviewnav?CY=ep&LG=en&DB=epd&PN=US6101480&ID=US++
# +6101480A1+I+&PG=2&RETR=NOK&TOTPG=18&NOTPG=1&DESPG=11&CLAPG=
# 17&DRAPG=2&REPPG=0
# GET /dips/bns.pdf?PN=US6101480&ID=US+++6101480A1+I+&PG=2

my $found = 0;
for my $try_this_id (map { $patent_no . $_ } qw(A1 A2 B1)) {
    # $patent_no and @pagefile stay package variables on purpose:
    # fetch_patent_id() and concat_pdf_files() read them.
    if (@pagefile = fetch_patent_id($try_this_id)) {
        concat_pdf_files();
        $found = 1;
        last;
    }
}

if (not $found) {
    print "$patent_no not found\n";
    # Report the failure to the calling shell as well.
    exit 1;
}
# Fetch the HTML views (bibliography, description, claims) and the drawing
# pages of a patent and store each of them in its own file.
#
# Argument:
#   $patent_no - patent number as used in the espacenet URLs, e.g. WO0115026.
# Side effects:
#   - sets the package variable $baseoutfile to $patent_no;
#   - writes <patent_no>.biblio.html, <patent_no>.description.html,
#     <patent_no>.claims.html and <patent_no>.drawing.<n>;
#   - stores every fetched document in the package hash %content;
#   - appends each drawing file to @pagefile and @to_be_removed.
# Dies when an output file cannot be written.
sub fetch_html {
    my $patent_no = shift;
    $baseoutfile = $patent_no;

    # Example URLs:
    # http://l2.espacenet.com/dips/viewer?PN=WO0115026&CY=ep&LG=en&DB=EPD
    # http://l2.espacenet.com/dips/desc?LG=en&CY=ep&DB=EPD&PNP=WO0115026&PN=WO0115026&FTDB=WO1
    # http://l2.espacenet.com/dips/claims?LG=en&CY=ep&DB=EPD&PNP=WO0115026&PN=WO0115026&FTDB=WO1
    my %url_for = (
        biblio      => 'http://l2.espacenet.com/dips/viewer?PN=' . $patent_no
                       . "&CY=ep&LG=en&DB=EPD",
        description => 'http://l2.espacenet.com/dips/desc?LG=en&CY=ep&DB=EPD&PNP='
                       . $patent_no . '&PN=' . $patent_no . '&FTDB=WO1',
        claims      => 'http://l2.espacenet.com/dips/claims?LG=en&CY=ep&DB=EPD&PNP='
                       . $patent_no . '&PN=' . $patent_no . '&FTDB=WO1',
    );

    # The three text views differ only in URL and file name.
    for my $section (qw(biblio description claims)) {
        $content{$section} = get_url($url_for{$section});
        _save_patent_file("$baseoutfile.$section.html", $content{$section});
    }

    # Drawings, e.g.
    # http://l2.espacenet.com/dips/drawing.pdf?PNP=WO0115026&CURDRAW=2&DRDB=WO1
    my $max_pages = 99;
    for my $page (1 .. $max_pages) {
        my $url = 'http://l2.espacenet.com/dips/drawing.pdf?PNP=' . $patent_no
                . '&CURDRAW=' . $page . '&DRDB=WO1';
        my $drawing = get_url($url);

        # An empty reply means the fetch itself failed; stop instead of
        # writing empty files for every remaining page number.
        last if !defined $drawing or $drawing eq '';
        last if get_error($drawing);

        $content{$page} = $drawing;
        my $pagefile = "$baseoutfile.drawing.$page";
        _save_patent_file($pagefile, $drawing);
        push @pagefile,      $pagefile;
        push @to_be_removed, $pagefile;
    }
    return;
}

# Write $data to the file $path, replacing any previous content.
# Dies with the system error when the file cannot be written.
sub _save_patent_file {
    my ($path, $data) = @_;
    open(my $out, '>', $path) or die "Cannot write $path: $!";
    binmode $out;    # drawing pages are PDF data
    print {$out} (defined $data ? $data : '');
    close($out) or die "Cannot close $path: $!";
    return;
}
# Download the pages of one patent publication, one file per page.
#
# Argument:
#   $patent_id - publication number including the kind code, e.g. EP0947922A2.
# Returns:
#   the package array @pagefile after the downloaded page files have been
#   appended to it (nothing is appended when the first page cannot be
#   fetched).
# Side effects:
#   - sets the package variable $baseoutfile to the package variable
#     $patent_no;
#   - writes <patent_no>.<page> files;
#   - appends every written file to @pagefile and @to_be_removed.
# Dies when a page file cannot be written.
sub fetch_patent_id {
    my $patent_id = shift;
    my $max_pages = 99;

    # EP0947922A2 => EP+++0947922A2+I+
    $patent_id =~ s/^(..)/$1+++/;
    $patent_id .= '+I+';

    my $baseurl = 'http://l2.espacenet.com/dips/bns.pdf?ID=' . $patent_id . '&PG=';
    $baseoutfile = $patent_no;

    for my $page (1 .. $max_pages) {
        my $content = get_url($baseurl . $page);

        # An empty reply means the fetch itself failed; stop instead of
        # writing empty page files for every remaining page number.
        last if !defined $content or $content eq '';
        # The server answers with an error page once we are past the last page.
        last if get_error($content);

        my $pagefile = "$baseoutfile.$page";
        open(my $out, '>', $pagefile) or die "Cannot write $pagefile: $!";
        binmode $out;    # the pages are PDF data
        print {$out} $content;
        close($out) or die "Cannot close $pagefile: $!";

        push @pagefile,      $pagefile;
        push @to_be_removed, $pagefile;
    }
    return @pagefile;
}
# Merge the page files listed in the package array @pagefile into the single
# file "$baseoutfile.pdf" using ghostscript, then delete the page files and
# the intermediate files.
#
# Takes no arguments; works on the package variables @pagefile (emptied),
# $baseoutfile, $filenr and @to_be_removed.
# Dies when ghostscript fails or the result cannot be renamed; in that case
# nothing is deleted, so the downloaded pages are still available.
sub concat_pdf_files {
    my $startpage = shift @pagefile;

    # Nothing downloaded, nothing to merge.
    return unless defined $startpage;

    while (@pagefile) {
        # concat only 10 files at a time to work around bug in ghostscript
        my @tenfirst = splice(@pagefile, 0, 10);
        $filenr++;
        my $tempfile = $baseoutfile . "concat" . $filenr;

        # List form: the file names are passed to gs directly, without a
        # shell in between that could reinterpret them.
        system('gs', '-q', '-dNOPAUSE', '-dBATCH', '-sDEVICE=pdfwrite',
               "-sOutputFile=$tempfile", '-c', 'save', 'pop', '-f',
               $startpage, @tenfirst) == 0
            or die "gs failed while writing $tempfile (status $?)";

        # The merged file is the first input of the next round.
        $startpage = $tempfile;
        push @to_be_removed, $tempfile;
    }

    rename($startpage, "$baseoutfile.pdf")
        or die "Cannot rename $startpage to $baseoutfile.pdf: $!";

    # The renamed file is also on this list; unlink ignores that it is gone.
    unlink @to_be_removed;
}
# Fetch a URL with "lynx -source" and return the raw document.
#
# Argument:
#   $url - the address to fetch.
# Returns:
#   the document as one string; the empty string when lynx could not be
#   started or produced no output.
sub get_url {
    my $url = shift;

    # List-form pipe open: the URL is handed to lynx as a single argument
    # and never passes through a shell, so quotes or other shell
    # metacharacters in it cannot change the command.
    open(my $lynx, '-|', 'lynx', '-source', $url) or return '';
    binmode $lynx;    # the document may be binary (PDF)

    my $content = do { local $/; <$lynx> };    # read everything at once
    close $lynx;

    return defined $content ? $content : '';
}
# Tell whether a fetched document is the server's error page.
#
# Argument:
#   $content - the fetched document (may be undefined).
# Returns:
#   1 when the document contains both marker texts of the "Service is
#   temporarily unavailable" page; otherwise nothing, i.e. undef in scalar
#   context and the empty list in list context, so the result is false in
#   either context.
sub get_error {
    my $content = shift;

    return unless defined $content;
    return 1
        if $content =~ m:<P>Service is temporarily unavailable</P>:
        and $content =~ m:Use the BACK button to resume your processing:;
    return;
}
|
||||||||||||||
|
||||||||||||||