Skåne Sjælland Linux User Group - http://www.sslug.dk Forside   Tilmelding   Postarkiv   Forum   Kalender   Søg
MhonArc Dato: [Date Prev] [Kronologisk oversigt] [Date Next]   Tråd: [Date Prev] [Oversigt tråde] [Date Next]   MhonArc
 

Re: [PERL] POST/GET http push modul?



On Tue, 4 Dec 2001, Carsten Svaneborg wrote:

> Jeg vil lave et perl script der fodret med en liste
> af EPxxxxxx og WOyyyyyyy patenter udspytter en html

Måske kan min patent-fetcher være til inspiration.

/Ole

#!/usr/bin/perl

# NAME
#       patentfetcher - fetch a patent
#
# SYNOPSIS
#       patentfetcher patentnumber
#
# DESCRIPTION
#       Fetches the given patent from espacenet.com (EPO) and
#       returns all the pages into one single PDF-file,
#       <patentnumber>.pdf
#
# BUGS
#       The patentnumber must include leading zeros but no kind-code
#       suffix, e.g. EP0947922; the suffixes A1, A2 and B1 are tried
#       in turn. To find the exact number you may want to locate the
#       patent on espacenet.com and then use patentfetcher to
#       download it.
#
#       The resulting PDF-file is huge even though the fetched files
#       are much smaller. It seems to be a bug in ghostscript.
#
# LICENSE
#       GPL
#
# COPYRIGHT
#       Copyright (C) 2001-07-13  Ole Tange <sslug@sslug>



# Main program: validate the patent number given on the command line, then
# try the known kind-code suffixes until one of them yields pages.
#
# $patent_no and @pagefile are deliberately package globals:
# fetch_patent_id() reads $patent_no to name its output files, and
# concat_pdf_files() consumes @pagefile.

# $patent_no="EP0947922";
$patent_no = shift;
# \A ... \z instead of ^ ... $ so that a trailing newline is rejected; the
# value ends up in file names and in shell command lines.
defined $patent_no && $patent_no =~ /\A[A-Z0-9]+\z/
    or die "Patent must be something like EP0947922";

#fetch_html($patent_no);

# Patent_id is one of EP0947922A1, EP0947922A2 or EP0947922B1

#  GET /dips/bnsviewnav?CY=ep&LG=en&DB=epd&PN=US6101480&ID=US++
#  +6101480A1+I+&PG=2&RETR=NOK&TOTPG=18&NOTPG=1&DESPG=11&CLAPG=
#  17&DRAPG=2&REPPG=0
#  GET /dips/bns.pdf?PN=US6101480&ID=US+++6101480A1+I+&PG=2

my $found = 0;
for my $suffix (qw(A1 A2 B1)) {
    # The first suffix that returns at least one page wins.
    if (@pagefile = fetch_patent_id($patent_no . $suffix)) {
        concat_pdf_files();
        $found = 1;
        last;
    }
}

if (not $found) {
    print "$patent_no not found\n";
    # Report the failure to the calling shell as well.
    exit 1;
}

# fetch_html($patent_no)
#
# Downloads the HTML views of a patent (bibliographic data, description and
# claims) into <patent_no>.biblio.html, <patent_no>.description.html and
# <patent_no>.claims.html, followed by up to 99 drawing pages stored as
# <patent_no>.drawing.<n>.
#
# Globals written: $baseoutfile, %content (every reply, keyed by section
# name or page number), @pagefile and @to_be_removed (both receive the
# names of the drawing files).
#
# Dies if an output file cannot be written.
sub fetch_html {
    my $patent_no = shift;

    $baseoutfile = $patent_no;
    my $site      = 'http://l2.espacenet.com/dips';
    my $max_pages = 99;

    # Write $data to $file; $raw selects binary mode for PDF data.
    my $save = sub {
        my ($file, $data, $raw) = @_;
        open(my $out, '>', $file) or die "Cannot write $file: $!";
        binmode $out if $raw;
        print {$out} $data;
        close($out) or die "Cannot finish writing $file: $!";
    };

    # Example URLs:
    #   http://l2.espacenet.com/dips/viewer?PN=WO0115026&CY=ep&LG=en&DB=EPD
    #   http://l2.espacenet.com/dips/desc?LG=en&CY=ep&DB=EPD&PNP=WO0115026&PN=WO0115026&FTDB=WO1
    #   http://l2.espacenet.com/dips/claims?LG=en&CY=ep&DB=EPD&PNP=WO0115026&PN=WO0115026&FTDB=WO1
    my %url_for = (
        biblio      => "$site/viewer?PN=$patent_no&CY=ep&LG=en&DB=EPD",
        description => "$site/desc?LG=en&CY=ep&DB=EPD&PNP=$patent_no&PN=$patent_no&FTDB=WO1",
        claims      => "$site/claims?LG=en&CY=ep&DB=EPD&PNP=$patent_no&PN=$patent_no&FTDB=WO1",
    );
    for my $section (qw(biblio description claims)) {
        $content{$section} = get_url($url_for{$section});
        $save->("$baseoutfile.$section.html", $content{$section});
    }

    # Drawings, e.g.
    #   http://l2.espacenet.com/dips/drawing.pdf?PNP=WO0115026&CURDRAW=2&DRDB=WO1
    for my $page (1 .. $max_pages) {
        my $url = "$site/drawing.pdf?PNP=$patent_no&CURDRAW=$page&DRDB=WO1";
        my $pdf = $content{$page} = get_url($url);

        # Stop at the server's error page, and also when lynx delivered
        # nothing at all: an empty file is never a usable page.
        last if !defined($pdf) || $pdf eq '' || get_error($pdf);

        my $pagefile = "$baseoutfile.drawing.$page";
        $save->($pagefile, $pdf, 1);
        push @pagefile, $pagefile;
        push @to_be_removed, $pagefile;
    }
    return;
}


# fetch_patent_id($patent_id)
#
# Downloads the pages of one patent document, given its number including
# the kind-code suffix (e.g. EP0947922A2). Pages are requested one at a
# time, at most 99 of them, and each is stored as <patent_no>.<page>,
# where <patent_no> is the global $patent_no.
#
# Globals read:    $patent_no
# Globals written: $baseoutfile, %content (every reply, keyed by page
#                  number), @pagefile and @to_be_removed (both receive the
#                  names of the files written).
#
# Returns the contents of @pagefile, which is empty when no page could be
# fetched. Dies if a page file cannot be written.
sub fetch_patent_id {
    my $patent_id = shift;

    # EP0947922A2 => EP+++0947922A2+I+
    $patent_id =~ s/^(..)/$1+++/;
    $patent_id .= '+I+';

    my $baseurl   = 'http://l2.espacenet.com/dips/bns.pdf?ID=' . $patent_id . '&PG=';
    my $max_pages = 99;
    $baseoutfile = $patent_no;

    for my $page (1 .. $max_pages) {
        my $pdf = $content{$page} = get_url($baseurl . $page);

        # Stop at the server's error page, and also when lynx delivered
        # nothing at all: an empty file is never a usable PDF page.
        last if !defined($pdf) || $pdf eq '' || get_error($pdf);

        my $pagefile = "$baseoutfile.$page";
        open(my $out, '>', $pagefile) or die "Cannot write $pagefile: $!";
        binmode $out;    # PDF data must be written untranslated
        print {$out} $pdf;
        close($out) or die "Cannot finish writing $pagefile: $!";

        push @pagefile, $pagefile;
        push @to_be_removed, $pagefile;
    }
    return @pagefile;
}

# concat_pdf_files()
#
# Merges the page files listed in the global @pagefile into
# "$baseoutfile.pdf" with ghostscript, then deletes every file named in
# the global @to_be_removed.
#
# Globals read:    $baseoutfile
# Globals written: @pagefile (emptied), @to_be_removed (intermediate files
#                  are appended), $filenr (counter for intermediate names)
#
# Does nothing when @pagefile is empty. Dies, leaving the page files on
# disk, if ghostscript fails or the result cannot be renamed.
sub concat_pdf_files {
    return unless @pagefile;

    my $merged = shift @pagefile;
    while (@pagefile) {
        # concat only 10 files at a time to work around bug in ghostscript
        my @batch = splice(@pagefile, 0, 10);
        $filenr++;
        my $tempfile = $baseoutfile . "concat" . $filenr;

        # List form: gs is started without a shell, so the file names are
        # passed through literally. The status check keeps a failed merge
        # from being followed by the cleanup below.
        system('gs', '-q', '-dNOPAUSE', '-dBATCH', '-sDEVICE=pdfwrite',
               "-sOutputFile=$tempfile", '-c', 'save', 'pop',
               '-f', $merged, @batch) == 0
            or die "gs failed while creating $tempfile (status $?); page files kept\n";

        $merged = $tempfile;
        push @to_be_removed, $tempfile;
    }

    rename($merged, "$baseoutfile.pdf")
        or die "Cannot rename $merged to $baseoutfile.pdf: $!\n";

    # The file just renamed is usually in this list too; unlink simply
    # skips names that no longer exist.
    unlink @to_be_removed;
}

# get_url($url)
#
# Fetches $url with "lynx -source" and returns the raw response body as
# one string. Returns undef if lynx could not be started.
sub get_url {
    my $url = shift;
    my $content;

    # List-form pipe open starts lynx directly, without a shell, so quotes
    # and other metacharacters in $url reach lynx as plain text.
    if (open(my $lynx, '-|', 'lynx', '-source', $url)) {
        binmode $lynx;    # the body may be binary (PDF)
        local $/;         # read everything in one go
        $content = <$lynx>;
        close $lynx;
    }
    return $content;
}

# get_error($content)
#
# True when $content is the server's "service unavailable" page, i.e. when
# it contains both of the marker texts below.
sub get_error {
    my ($page_body) = @_;

    my $unavailable_notice = qr:<P>Service is temporarily unavailable</P>:;
    my $back_button_hint   = qr:Use the BACK button to resume your processing:;

    return ($page_body =~ $unavailable_notice
            and $page_body =~ $back_button_hint);
}



 
Forside   Tilmelding   Postarkiv   Oversigt   Kalender   Søg

 
 
Henvendelse vedrørende websiderne til <www_admin>. Senest ændret 2005-08-10, klokken 19:54
Denne side vedligeholdes af MHonArc .