Paste from MS-Word to any browser-based HTML editor

Question

Paste from MS-Word to any browser-based HTML editor

Well, I can't be the only one with this problem, which seems to go on forever and always.

We use browser-based HTML editors (several different ones) - each of them has its own "Paste from Word" button, which works fine.

However, many of our users simply paste directly from Word into the design view. For us, this causes no end of trouble - sometimes it breaks JavaScript, among other problems too numerous to mention.

So, I decided to write a simple function that inspects the source content and, if it sees any bad strings associated with a paste from Microsoft Word, reports a match along the lines of “this looks like MS Word markup”. Currently it looks like this:

/**
 * Detect potentially bad strings - usually the result of an MS Word paste.
 *
 * Depends on two names that are NOT defined in this snippet:
 *   - $j : the author's jQuery alias, used to read the element's value.
 *   - $E : the author's "enhanced string" wrapper, which provides contains().
 *          NOTE(review): contains() is presumably a plain substring test -
 *          confirm against the $E implementation.
 *
 * @param {*} in_element  Anything $j() accepts (element or selector) whose
 *                        val() yields the HTML source to inspect.
 * @returns {boolean} true if any known Word marker is present, else false.
 */
function hasInvalidChars ( in_element ) {
    var src = $j(in_element).val();
    var e = $E(src); // enhanced string

    // Markers that only show up in markup pasted straight from Word.
    var bad = [
        "mso-list:",
        "class=\"Mso",
        "</o:p>",
        "[if !supportLists]",
        "style=\"mso-",
        "mso-bidi",
        // Typographic (curly) double quotes. They were garbled into plain
        // quotes in the published snippet; written as escapes here so they
        // survive any further re-encoding.
        "\u201C",
        "\u201D",
        "<v:shapetype",
        "<v:path",
        "file:///"
    ];

    // `var i` keeps the counter local; the original leaked a global `i`.
    for ( var i = 0; i < bad.length; i++ ) {
        if ( e.contains(bad[i]) ) {
            return true;
        }
    }
    return false;
}

Please note that if you try to run this code as-is, it will not work, because (1) I use jQuery and (2) I have a special object ($E) that adds a bunch of methods to a string, including the contains() function - but you get the idea of what it is doing.

I am looking for entries to put in the "bad[]" array. I came up with a tentative list (which may or may not be a good starting point), but I ask you experts: which characters or phrases would you put here? At the moment, if I could catch 80% of the cases, I would be delighted.

Thanks.

+6

javascript html ms-word tinymce

Onenerd May 03 '11 at 16:59

source share

2 answers

Zeno · Answer 1 · 2011-05-03T18:54:37+0000

I recently looked for something similar, and after a bit of googling I found a nice function. It handles most of the bad tags. Below is the function, followed by a link to the page it comes from:

Javascript function

 /**
  * Strip Microsoft Word specific markup from an HTML fragment.
  *
  * Originally published wrapped in a <script type="text/javascript"> element;
  * shown here as plain JavaScript. The replacements run in a fixed order and
  * later steps rely on earlier ones (for example, style attributes are emptied
  * before the empty style="" attribute is dropped), so do not reorder them.
  *
  * Fixes relative to the published one-line version:
  *   - the inline "//" comment no longer swallows the <U>/<I>/<STRIKE> and <B>
  *     clean-up statements;
  *   - the <B>&nbsp;</B> pattern uses the \1 back-reference instead of "\b|B>",
  *     which deleted every "B>" in the document;
  *   - the size clean-up only removes size attributes inside tags instead of
  *     deleting the word "size" wherever it occurs;
  *   - <P ...> conversion no longer matches <pre>, <param> and friends;
  *   - null / undefined input yields "" instead of throwing.
  *
  * @param {string} str  HTML fragment, typically pasted from Word.
  * @returns {string} The cleaned HTML.
  */
 function CleanWordHTML( str ) {
     if ( str === null || str === undefined ) {
         return "";
     }
     var html = String( str );
     var pass;

     // Empty <o:p></o:p> pairs disappear; non-empty ones become a space.
     html = html.replace( /<o:p>\s*<\/o:p>/g, "" );
     html = html.replace( /<o:p>.*?<\/o:p>/g, "&nbsp;" );

     // Word-only CSS declarations inside style attributes.
     html = html.replace( /\s*mso-[^:]+:[^;"]+;?/gi, "" );
     html = html.replace( /\s*MARGIN: 0cm 0cm 0pt\s*;/gi, "" );
     html = html.replace( /\s*MARGIN: 0cm 0cm 0pt\s*"/gi, "\"" );
     html = html.replace( /\s*TEXT-INDENT: 0cm\s*;/gi, "" );
     html = html.replace( /\s*TEXT-INDENT: 0cm\s*"/gi, "\"" );
     html = html.replace( /\s*TEXT-ALIGN: [^\s;]+;?"/gi, "\"" );
     html = html.replace( /\s*PAGE-BREAK-BEFORE: [^\s;]+;?"/gi, "\"" );
     html = html.replace( /\s*FONT-VARIANT: [^\s;]+;?"/gi, "\"" );
     html = html.replace( /\s*tab-stops:[^;"]*;?/gi, "" );
     html = html.replace( /\s*tab-stops:[^"]*/gi, "" );

     // Font face / family information.
     html = html.replace( /\s*face="[^"]*"/gi, "" );
     html = html.replace( /\s*face=[^ >]*/gi, "" );
     html = html.replace( /\s*FONT-FAMILY:[^;"]*;?/gi, "" );

     // class, style and lang attributes.
     html = html.replace( /<(\w[^>]*) class=([^ |>]*)([^>]*)/gi, "<$1$3" );
     html = html.replace( /<(\w[^>]*) style="([^\"]*)"([^>]*)/gi, "<$1$3" );
     html = html.replace( /\s*style="\s*"/gi, "" );
     html = html.replace( /<SPAN\s*[^>]*>\s*&nbsp;\s*<\/SPAN>/gi, "&nbsp;" );
     html = html.replace( /<SPAN\s*[^>]*><\/SPAN>/gi, "" );
     html = html.replace( /<(\w[^>]*) lang=([^ |>]*)([^>]*)/gi, "<$1$3" );

     // Unwrap attribute-less <span> and <font> elements.
     html = html.replace( /<SPAN\s*>(.*?)<\/SPAN>/gi, "$1" );
     html = html.replace( /<FONT\s*>(.*?)<\/FONT>/gi, "$1" );

     // XML declarations and namespaced tags such as <o:p>, <v:shape>, <w:...>.
     html = html.replace( /<\\?\?xml[^>]*>/gi, "" );
     html = html.replace( /<\/?\w+:[^>]*>/gi, "" );

     // Headings: drop empty ones, drop opening tags, turn closing tags into
     // a line break.
     html = html.replace( /<H\d>\s*<\/H\d>/gi, "" );
     html = html.replace( /<H[1-6]([^>]*)>/gi, "" );
     // Remove the next statement to take out breaks where heading tags were.
     html = html.replace( /<\/H\d>/gi, "<br>" );

     // Inline formatting that wraps nothing but a non-breaking space.
     html = html.replace( /<(U|I|STRIKE)>&nbsp;<\/\1>/g, "&nbsp;" );
     html = html.replace( /<(B|b)>&nbsp;<\/\1>/g, "" );

     // Remove empty elements. Three passes, as in the original, so that up to
     // three levels of nested empty elements collapse.
     for ( pass = 0; pass < 3; pass++ ) {
         html = html.replace( /<([^\s>]+)[^>]*>\s*<\/\1>/g, "" );
     }

     // <p ...>...</p> becomes <div ...>...</div>. The tag name must be
     // followed by whitespace or ">" so that <pre>/<param> are left alone.
     html = html.replace( /<P(\s[^>]*)?>(.*?)<\/P>/gi, "<div$1>$2</div>" );

     // Remaining <font ...>...</font> elements also become <div>.
     html = html.replace( /(<font)([^*>]*>.*?)(<\/font>)/gi, "<div$2</div>" );

     // Drop size attributes (left over from <font size=...>) inside tags only.
     html = html.replace(
         /(<[^>]*?)\s+size\s*=\s*(?:"[+-]?\d+"|'[+-]?\d+'|[+-]?\d+)/gi,
         "$1"
     );

     return html;
 }

http://www.1stclassmedia.co.uk/developers/clean-ms-word-formatting.php

All credit goes to the original author.

Thariama · Answer 2 · 2011-05-04T10:06:13+0000

TinyMCE has a flag for this when pasting.

You should look at the paste_preprocess setting of the paste plugin.

Here you can access the pasted content via "o" and find out whether the paste came from Word. Example:

 paste_preprocess : function(pl, o) { //if(console) console.log('content', o); if (o.wordContent ) { alert('paste from WORD detected!!!'); } ... },

I use a custom function to get rid of unwanted tags (I was not happy with how TinyMCE handles this by default, so I wrote my own).

Paste from MS-Word to any browser-based HTML editor

More articles: