-1

I have a simple function to get the value of an HTML page's <meta property="og:image"> tag:

// Looks for the first 'og:image' in HTML, then the next 'content="' after it,
// and appends the text between that opening quote and the following '"' to URL.
// Returns False when 'og:image' or 'content="' is not found, True otherwise.
// NOTE(review): URL is appended to, never cleared - callers presumably pass an
// empty string; confirm at the call sites.
Function HTML_GetOGImage(HTML:String;Var URL:String):Boolean;
Var
  I,A,B,C:Integer;
Begin
  Result:=False;
  // A = position of the first 'og:image' (PosEx yields 0 when there is none).
  A:=PosEx('og:image',HTML,1);
  // B = position of the next 'content="', searched from A+9.
  If A<>0 Then B:=PosEx('content="',HTML,A+9)Else Exit;
  // C = position of the closing quote; B+9 skips the 9 characters of 'content="'.
  If B<>0 Then C:=PosEx('"',HTML,B+9)Else Exit;
  // Appends HTML[B+9 .. C-1] character by character. When no closing quote
  // exists C is 0, the loop body never runs and the function still returns True.
  For I:=B+9 To C-1 Do URL:=URL+HTML[I];
  Result:=True;
End;

It works well, but on some websites, where the HTML is one long line of characters without carriage returns and the meta tag is at a position higher than 1024, the function returns without doing anything (e.g. here).

What is the best practice for handling long Strings? Is PosEx limited in terms of a String's length?

AmigoJack
  • 5,234
  • 1
  • 15
  • 31
Stalkium
  • 148
  • 1
  • 11
  • If that's true, I'd say it is a bug. Which version of Delphi are you using? – HeartWare Jun 16 '21 at 11:44
  • Change your first `PosEx()` to `Pos()`: is the problem still the same? Also "_the function return blank_" is both ambiguous and impossible: which function? What means "blank" to you? – AmigoJack Jun 16 '21 at 11:52
  • @HeartWare 10.3, work with all pages html and for this one when it returned nothing i just manually download the HTML in a notepad and found that the og:image is at 1024 pos for O and 1025 for the G – Stalkium Jun 16 '21 at 12:00
  • @AmigoJack i mean nothing happens, even the application is not stuck ! and if i try it with another webpage it extract the image normally ... i tried with POS for the first POSEX, same result – Stalkium Jun 16 '21 at 12:03
  • If you're talking about the HTML from the page you're linking, I don't find "og_image" in column 1024, but in column 4086... – HeartWare Jun 16 '21 at 12:08
  • 1
    And I find the following URL using your exact code to scan: 'https://www.leparisien.fr/resizer/tHSUGi9wTnVSkXz4NUvbjObFPVw=/1200x675/cloudfront-eu-central-1.images.arcpublishing.com/leparisien/7S5TB4X22BCGFMC6ZX7FVJXI4U.jpg' – HeartWare Jun 16 '21 at 12:11
  • 1
    Like @HeartWare, I tried your code on your page using your Delphi version. And I get https://www.leparisien.fr/resizer/tHSUGi9wTnVSkXz4NUvbjObFPVw=/1200x675/cloudfront-eu-central-1.images.arcpublishing.com/leparisien/7S5TB4X22BCGFMC6ZX7FVJXI4U.jpg – Andreas Rejbrand Jun 16 '21 at 12:14
  • 2
    (Off-topic hint: Don't do `For I:=B+9 To C-1 Do URL:=URL+HTML[I];`. Use `Copy`. And use a proper HTML parser.) – Andreas Rejbrand Jun 16 '21 at 12:15
  • @HeartWare i tried several url from this site LeParisien, because they put their HTML in block without carriage return, perhaps it was another page, but basically all og:image will be found above the 1024 limit, anyway still have the bug, but i just learned now how to use TJVHTMLPARSER and it work like a charm, so do i have the close the question or let it perhaps someone find a solution ? Many thanks guys – Stalkium Jun 16 '21 at 12:17
  • @AndreasRejbrand +1 for the parser i just tried TJVHTMLPARSER and its ok, thanks – Stalkium Jun 16 '21 at 12:18
  • 2
    There's no problem with >1024. As I said, my test found it at position 4000+. Anyway, if you have found an alternative solution, that's the main thing... – HeartWare Jun 16 '21 at 12:19

2 Answers

1

This cannot be PosEx()'s fault. One can simply counter test this by forcing a situation which you think is the culprit:

// Counter-test: builds a 2000-character filler string, appends the search text
// behind it and prints the position at which PosEx finds that text.
var
  s: String;
  i: Integer;
begin
  // Just a long string
  SetLength( s, 2000 );
  // Fill it with the digits '0'..'9' repeating ($30 is the character code of '0')
  for i:= 1 to 2000 do s[i]:= Chr( i mod 10+ $30 );

  // The text to be found is clearly after 1024 characters
  s:= s+ 'og:image';
  i:= PosEx( 'og:image', s, 1 );

  // Should not be 0 but instead 2001
  Writeln( i );

If this code works for you and the searched text is found, then your error is somewhere else. PosEx() (and Pos()) should also have no problem with much larger strings, e.g. one million characters. I think your understanding is wrong: you expect your variable HTML to contain what you want, but have you even checked its length or content (e.g. by saving it to a file)?

AmigoJack
  • 5,234
  • 1
  • 15
  • 31
0

PosEx() works perfectly without any problem; the premise of your question is wrong.

Run this in your Delphi; I don't see any problem.

Unit1.dfm

object Form1: TForm1
  Left = 0
  Top = 0
  Caption = 'Form1'
  ClientHeight = 231
  ClientWidth = 505
  Color = clBtnFace
  Font.Charset = DEFAULT_CHARSET
  Font.Color = clWindowText
  Font.Height = -11
  Font.Name = 'Tahoma'
  Font.Style = []
  OldCreateOrder = False
  OnShow = FormShow
  PixelsPerInch = 96
  TextHeight = 13
  object WebBrowser1: TWebBrowser
    Left = 0
    Top = 0
    Width = 505
    Height = 231
    Align = alClient
    TabOrder = 0
    OnDownloadComplete = WebBrowser1DownloadComplete
    ExplicitLeft = 120
    ExplicitTop = 73
    ExplicitWidth = 300
    ExplicitHeight = 150
    ControlData = {
      4C00000031340000E01700000000000000000000000000000000000000000000
      000000004C000000000000000000000001000000E0D057007335CF11AE690800
      2B2E126208000000000000004C0000000114020000000000C000000000000046
      8000000000000000000000000000000000000000000000000000000000000000
      00000000000000000100000000000000000000000000000000000000}
  end
end

unit1.pas

unit Unit1;

interface

uses
  Winapi.Windows, Winapi.Messages, System.SysUtils, System.Variants, System.Classes, Vcl.Graphics,
      Vcl.Controls, Vcl.Forms, Vcl.Dialogs, Vcl.OleCtrls, SHDocVw;

const
  siteURL = 'https://www.leparisien.fr/societe/sante/covid-19-fin-de-la-periode-disolement-de-jean-castex-dont-lepouse-avait-ete-testee-positive-suivez-notre-direct-16-06-2021-BKACFU43MBAATAOLEN7BYHNNIY.php';

type
  // Main form of the demo. The members declared before 'private' are the
  // component and the event handlers that Unit1.dfm refers to by name
  // (OnShow = FormShow, OnDownloadComplete = WebBrowser1DownloadComplete).
  TForm1 = class(TForm)
    // Embedded browser control, declared in Unit1.dfm with Align = alClient.
    WebBrowser1: TWebBrowser;
    // Form OnShow handler.
    procedure FormShow(Sender: TObject);
    // Browser OnDownloadComplete handler.
    procedure WebBrowser1DownloadComplete(Sender: TObject);
  private
    { Private declarations }
  public
    { Public declarations }
  end;

var
  Form1: TForm1;

implementation

{$R *.dfm}

uses System.StrUtils, ActiveX;

// Extracts the value of the content="..." attribute that follows the first
// 'og:image' occurrence in HTML.
//
//   HTML - markup to scan.
//   URL  - receives the extracted text on success; left untouched on failure.
//
// Returns True only when 'og:image', the following 'content="' and its closing
// quote were all found; otherwise returns False.
function HTML_GetOGImage(HTML:String;var URL:String) : Boolean;
const
  PropertyMarker = 'og:image';
  ContentMarker  = 'content="';
var
  A,B,C:Integer;
begin
  Result := False;

  // A: position of the property name.
  A := PosEx(PropertyMarker, HTML, 1);
  if A = 0 then Exit;

  // B: position of the content attribute, searched right after the marker.
  B := PosEx(ContentMarker, HTML, A + Length(PropertyMarker));
  if B = 0 then Exit;

  // C: closing quote of the attribute value. Without it the value is
  // unterminated, so report failure instead of returning an empty URL as success.
  C := PosEx('"', HTML, B + Length(ContentMarker));
  if C = 0 then Exit;

  // Everything between the opening and the closing quote.
  URL := Copy(HTML, B + Length(ContentMarker), C - B - Length(ContentMarker));

  ShowMessage('A = ' + IntToStr(A) +#13#10+ 'B = ' + IntToStr(B) +#13#10+ 'C = ' + IntToStr(C)); //remove this line after test
  Result := True;
end;

// OnShow handler: points the embedded browser at the page named by siteURL.
procedure TForm1.FormShow(Sender: TObject);
begin
  Self.WebBrowser1.Navigate(siteURL);
end;

// OnDownloadComplete handler: saves the currently loaded document into an
// in-memory string stream and passes the resulting markup to HTML_GetOGImage.
// Does nothing when no document is loaded, when the document cannot be
// persisted to a stream, or when saving it fails.
procedure TForm1.WebBrowser1DownloadComplete(Sender: TObject);
var
  LStream: TStringStream;
  Stream : IStream;
  LPersistStreamInit : IPersistStreamInit;
  URL : String;
begin
  if not Assigned(WebBrowser1.Document) then exit;
  // Query the interface instead of using a hard "as" cast, so a document that
  // does not implement IPersistStreamInit is skipped rather than raising.
  if not Supports(WebBrowser1.Document, IPersistStreamInit, LPersistStreamInit) then exit;

  LStream := TStringStream.Create('');
  try
    // soReference: the adapter only references LStream; it is freed below.
    Stream := TStreamAdapter.Create(LStream,soReference);
    // Scan only when the document was actually written to the stream.
    if Succeeded(LPersistStreamInit.Save(Stream,true)) then
      HTML_GetOGImage(LStream.DataString, URL);
  finally
    LStream.Free();
  end;
end;

end.
AliReza
  • 106
  • 1
  • 7