Sort all records according to modified date and time

241 Views Asked by At

I have a problem with my code. I have about 1 GB of records that I need to sort by date and time. The records look like this:

TYP_journal article|KEY_1926000001|AED_|TIT_A Late Eighteenth-Century Purist|TPA_|GLO_Pronouncements of George Campbell and his contemporaries which time has set aside.|AUT_Bryan, W. F.|AUS_|AFF_|RES_|IED_|TOC_|FJN_Studies in Philology|ISN_0039-3738|ESN_|PLA_Chapel Hill, NC|URL_|DAT_1926|VOL_23|ISS_|EXT_358-370|CPP_|FSN_|ISN_|PLA_|SNO_|PUB_|IBZ_|PLA_|PYR_|PAG_|DAN_|DGI_|DGY_|OFP_|OFU_|FSS_|PDF_|LIB_|INO_|FAU_|INH_|IUR_|INU_|CDT_9/15/2003 3:12:28 PM|MDT_5/16/2017 9:18:40 AM|

I sort these records by the MDT_ field, e.g. MDT_5/16/2017 9:18:40 AM.

I used below technique:

  1. I split the file into records that have an MDT_ field and records that do not (creating two files: one with MDT_ and one without).

    For MDT data code:

    # Copy every record that carries a complete MDT_ timestamp
    # (m/d/y h:m:s AM|PM) into "<name>.ModifiedDate", unchanged.
    # "or" is required here: with "||" the die() attaches to the path string
    # (always true), so a failed open would go unnoticed.
    open(my $mdt_in, '<:encoding(UTF-8)', "$current_in/$file_name")
        or die "could not open '$current_in/$file_name' to read: $!";
    my @Dt_ModifiedDate = grep { $_ =~ /MDT_([0-9]+)\/([0-9]+)\/([0-9]+) ([0-9]+):([0-9]+):([0-9]+) ([A-Z]+)/i} <$mdt_in>;
    close($mdt_in);
    my $doc_MD = IO::File->new(">$current_ou/output/$file_name_with_out_ext.ModifiedDate")
        or die "could not create '$current_ou/output/$file_name_with_out_ext.ModifiedDate': $!";
    $doc_MD->binmode(':utf8');
    print $doc_MD @Dt_ModifiedDate;
    $doc_MD->close;
    

    For Un_MDT data code:

    # Copy every record that does NOT carry a complete MDT_ timestamp into
    # "<name>.unModifiedDate", unchanged.
    # The pattern uses /i, like the MDT filter, so that the two filters are
    # exact complements; without it a record with a lowercase "am"/"pm"
    # would be written to both files.
    open(my $no_mdt_in, '<:encoding(UTF-8)', "$current_in/$file_name")
        or die "could not open '$current_in/$file_name' to read: $!";
    my @un_ModifiedDate = grep { $_ !~ /MDT_([0-9]+)\/([0-9]+)\/([0-9]+) ([0-9]+):([0-9]+):([0-9]+) ([A-Z]+)/i} <$no_mdt_in>;
    close($no_mdt_in);
    my $doc_UMD = IO::File->new(">$current_ou/output/$file_name_with_out_ext.unModifiedDate")
        or die "could not create '$current_ou/output/$file_name_with_out_ext.unModifiedDate': $!";
    $doc_UMD->binmode(':utf8');
    print $doc_UMD @un_ModifiedDate;
    $doc_UMD->close;
    
  2. From the file that contains MDT_, I collect all the dates and times, sort them, and then remove duplicates.

    # Decorate-sort-undecorate: pair each entry with its toISO8601() key,
    # order the pairs by that key (upper-cased, string comparison), then
    # keep only the original entries.
    # NOTE(review): how @modi_date is first filled is not shown in this excerpt.
    @modi_date = map $_->[0],
    sort { uc($a->[1]) cmp uc($b->[1]) } map { [ $_, toISO8601($_) ] } @modi_date;
    # Flip the ascending result so the largest key comes first.
    @modi_date = reverse (@modi_date);
    # NOTE(review): uniq is not defined in this excerpt - presumably imported
    # from List::Util or List::MoreUtils; confirm.
    @modi_date = uniq (@modi_date);
    
  3. For each sorted date and time, I grep all matching records from the MDT_ file and finally create the final file.

    # Write the records to "<name>.sorted_data", one group per entry of
    # @modi_date, in the order @modi_date is in.
    my $doc1 = new IO::File(">$current_ou/output/$file_name_with_out_ext.sorted_data");
    $doc1->binmode(':utf8');
    foreach my $changes (@modi_date)
    {
    chomp($changes);
    $Count_pro++;
    # NOTE(review): this scans all of @all_data_with_time once per entry of
    # @modi_date, and $changes is interpolated as a regex without \Q...\E.
    @ab = grep (/$changes/, @all_data_with_time);
    # Interpolating @ab joins its elements with the list separator ($",
    # a single space unless it has been changed) and appends one newline.
    print $doc1 ("@ab\n");
    $progress_bar->update($Count_pro);
    }
    $doc1->close;
    

But this process takes a long time. Is there any way to do it faster?

2

There are 2 best solutions below

1
bytepusher On

As you pointed out doing everything in memory is not an option on your machine. However, I do not see why you are first sorting the dates, to then grep all records with that date, instead of sorting all of those records on the date.

I also suspect that if you were to go through the original file line by line and not in one huge map sort split map, you might save some memory, but I'll leave that up to you to try - it would save you creating the files and then re-parsing things.

I would suggest doing 2 + 3 in one go:

Skip building @modi_date ( somewhere not visible to us :/ ).

# Read every record of the MDT-only file, sort the records by their MDT_
# timestamp (oldest first) and write them to 'sorted.txt', one per line.
# NOTE: DateTime::Format::Strptime must be loaded before this runs
# (use DateTime::Format::Strptime;).
my $mdt_fn = 'with_mdt.txt'; # <- whatever name you gave that file?
open ( my $fh, '< :encoding(UTF-8)', $mdt_fn )
    or die "could not open file '$mdt_fn' to read: $!";

my $dt_parser = DateTime::Format::Strptime->new(
   pattern => '%m/%d/%Y %r',
);

# get all records from file. To ensure we only need to parse the line once,
# store the datetime in a hashref.
my @records;
while ( my $line = <$fh> ){
    # Strip the line ending here; exactly one "\n" is added back on output.
    # Without this every record would be followed by an empty line.
    chomp $line;
    push @records, {
        dt     => _dt_from_record($line),
        record => $line,
    };
}
close $fh;

# If you wanted to CMP rather than doing datetime comparison,
# adapt _dt_from_record and use 'cmp' instead of '<=>'
@records = sort{ $a->{dt} <=> $b->{dt} }@records;

open ( my $out_fh, '> :encoding(UTF-8)', 'sorted.txt') or
    die "could not open file to write to: $!";

# Or reverse first if you want latest to oldest
print $out_fh $_->{record}."\n" for @records;
# Buffered write errors (e.g. disk full) only surface at close.
close $out_fh or die "could not finish writing 'sorted.txt': $!";

# I prefer using DateTime for this.
# Using a parser will alert me if some date was set, but cannot be parsed.
# If you want to spare yourself some additional time,
# why not store the parsed date in the file. However, I doubt this takes long.

# Extract the MDT_ field (everything after "MDT_" up to the next "|") from
# one record and hand it to the file-level $dt_parser.
#
# Parameters: $record - one record line.
# Returns:    whatever $dt_parser->parse_datetime returns for the field.
# Dies:       if the record has no MDT_ field, or if the parser returns
#             nothing for it.
sub _dt_from_record {

    my $record = shift;

    # Capture into a lexical and check the match: after a failed match $1
    # would still hold the value from some earlier, unrelated match.
    my ($stamp) = $record =~ /MDT_([^\|]+)/
        or die "no MDT_ field found in record: $record\n";

    my $dt = $dt_parser->parse_datetime($stamp);
    die "could not parse MDT_ value '$stamp'\n" unless defined $dt;
    return $dt;

}
0
Nikhil Ranjan On

I finally got it working. The complete code is:

use warnings;
use strict;
use 5.010;
use Cwd;
binmode STDOUT, ":utf8";
use Date::Simple ('date', 'today');
use Time::Simple;
use Encode;
use Time::Piece;
use Win32::Console::ANSI;
use Term::ANSIScreen qw/:color /;
use File::Copy;

# Record the start time at compile time (get_time() reads $start_run later)
# and announce it.
BEGIN {
    our $start_run = time();
    my $Start = localtime;
    print colored ['bold green'], ("\nstart time :- $Start\n");
}

## variables
my $current_dir = getcwd();

# Input and output directories come from the first two command-line arguments.
my $current_in = $ARGV[0];
my $current_ou = $ARGV[1];

my @un_ext_file;
my @un_ext_file1;

# Today's date and the current time are used to build the name under which
# an already existing "output" folder is archived.
my $current_data = today();
my $time         = Time::Simple->new();
my $hour         = $time->hours;
my $minute       = $time->minutes;
my $second       = $time->seconds;
my $current_time = join '-', $hour, $minute, $second;
my $ren_folder   = "output_${current_data}_${current_time}";

##check for output name DIR
# Make sure a fresh, empty "$current_ou/output" exists: an existing one is
# first renamed to "$current_ou/$ren_folder". This is done once, rather
# than attempting a mkdir for every entry found in $current_ou.
my $output_dir = "$current_ou/output";
if (-e $output_dir)
{
    move($output_dir, "$current_ou/$ren_folder")
        or die "could not rename '$output_dir' to '$current_ou/$ren_folder': $!";
}
mkdir($output_dir)
    or die "could not create '$output_dir': $!";

# Collect the names of all non-directory entries of $current_in into
# @un_ext_file1.
opendir(my $in_dh, $current_in)
    or die "could not open directory '$current_in': $!";
my @files_and_folder = readdir($in_dh);
closedir($in_dh);
foreach my $entry (@files_and_folder)
{
    next if $entry eq '.' or $entry eq '..';
    # readdir returns bare names, so the directory test has to be made
    # against the path inside $current_in, not the current working directory.
    next if -d "$current_in/$entry";
    push(@un_ext_file1, $entry);
}

##### check duplicate file name
# Strip a trailing ".<lowercase letters>" extension from every collected
# name, then report each base name every time it shows up again.
my %seen;
my @file_test;
for my $name (@un_ext_file1)
{
    my $stem = $name =~ /(.*)\.([a-z]+)$/ ? $1 : $name;
    push @file_test, $stem;
}
for my $stem (@file_test)
{
    print "'$stem' is duplicated.\n" if $seen{$stem}++;
}

##collect all file from array
# For every input file named "<base>.<lowercase extension>" write
# "$current_ou/output/<base>.txt" containing
#   1. all records with a complete MDT_ timestamp, newest first, then
#   2. all remaining records in their original order.
# The file is read once and split in memory; no intermediate files are
# created. A failure on one file is reported and the next file is tried.
foreach my $file_name (@un_ext_file1)
{
    # Only names ending in ".<lowercase letters>" are processed.
    next unless $file_name =~ /(.*)\.([a-z]+)$/;
    my $file_name_with_out_ext = $1;

    # A usable modified date: "MDT_m/d/y h:m:s" followed by AM/PM in either
    # case. "MDT_" itself is matched case-sensitively because toISO8601()
    # looks for exactly that tag.
    my $mdt_re = qr{MDT_[0-9]+/[0-9]+/[0-9]+ [0-9]+:[0-9]+:[0-9]+ (?i:[A-Z]+)};

    my $done = eval {
        #####read source file
        my $in_path = "$current_in/$file_name";
        # "or", not "||": "||" would bind to the path string and never die.
        open(my $in_fh, '<:encoding(UTF-8)', $in_path)
            or die "could not open '$in_path' to read: $!\n";

        # One pass puts each record in exactly one of the two groups. The
        # sort key is computed here, once per record, instead of on every
        # comparison during the sort.
        my (@dated, @undated);
        while (my $line = <$in_fh>)
        {
            if ($line =~ $mdt_re)
            {
                push @dated, [ toISO8601($line), $line ];
            }
            else
            {
                push @undated, $line;
            }
        }
        close($in_fh);
        print colored ['bold green'], ("\n\tAll ModifiedDate data Filtered\n\n");
        print colored ['bold green'], ("\n\tAll unModifiedDate data Filtered\n\n\n\n");

        ##### Sort oldest first on the ISO 8601 key (plain string order is
        ##### chronological), then reverse so the newest record comes first.
        my @all_sorted_data = reverse
            map  { $_->[1] }
            sort { $a->[0] cmp $b->[0] } @dated;
        undef @dated;    ##### free after use

        ##### write the final file: sorted dated records, then the rest
        my $out_path = "$current_ou/output/$file_name_with_out_ext.txt";
        open(my $out_fh, '>:encoding(UTF-8)', $out_path)
            or die "could not open '$out_path' to write: $!\n";
        print {$out_fh} @all_sorted_data;
        print {$out_fh} @undated;
        # Buffered write errors only surface at close.
        close($out_fh)
            or die "could not finish writing '$out_path': $!\n";
        1;
    };
    warn "could not process '$file_name': $@" unless $done;
}

#####Process Complete.
say "\n\n---------------------------------------------";
print colored ['bold green'], ("\tProcess Completed\n");
say "---------------------------------------------\n";

get_time();

# Convert the MDT_ field of one record to an ISO 8601 string
# ("YYYY-MM-DDTHH:MM:SS"), which sorts chronologically as plain text.
#
# Parameters: $record - one record line containing "MDT_<m/d/Y h:M:S AM|PM>"
#             terminated by "|".
# Returns:    the timestamp as produced by Time::Piece's datetime method.
# Dies:       if the record has no MDT_ field (Time::Piece itself raises
#             an error for a field it cannot parse).
sub toISO8601
{
    my $record = shift;

    # Capture into a lexical and check the match: after a failed match $1
    # would still hold the value of an earlier, unrelated match and the
    # record would silently get some other record's date.
    my ($stamp) = $record =~ /MDT_([^\|]+)/
        or die "toISO8601: no MDT_ field in record: $record\n";

    return(Time::Piece->strptime($stamp, '%m/%d/%Y %I:%M:%S %p')->datetime);
}

# Print how long the job ran (as hours:minutes:seconds, hours wrapping at
# 24) followed by the current local time. The start of the run is taken
# from the package variable $start_run set in the BEGIN block.
sub get_time
{
    our $start_run;
    my $elapsed = time() - $start_run;

    # The % operator works on the integer parts, so no explicit int() is needed.
    my $hours = ($elapsed / 3600) % 24;
    my $mins  = ($elapsed / 60) % 60;
    my $secs  = $elapsed % 60;

    print "\nJob took";
    print colored ['bold green'], (" $hours:$mins:$secs ");
    print "to complete this process\n";

    my $End = localtime;
    print colored ['bold green'], ("\nEnd time :- $End\n");
}

The whole process now completes within about 20 minutes.

I am especially thankful to @bytepusher.